### 1. Text search across the sentence using Regular expression

In [1]:
# Text search across the sentence using Regular expression
import re
words = ['very','nice','lecture','day','moon']
expression = '|'.join(words)

re.findall(expression, 'i attended a very nice lecture last year', re.M)

['very', 'nice', 'lecture']

### 2. Text to List

In [4]:
text_file = 'data.txt'

In [5]:
# Method-1 : Individual words as separate elements of the list
with open(text_file) as f:
    words = f.read().split()
    
print(words)

['Are', 'you', 'sure', 'moving', 'ahead', 'on', 'this', 'route', 'is', 'the', 'right', 'thing?']


In [7]:
# Method-2 : Whole text as single element of the list
f = open(text_file , 'r')
words_ = f.readlines()

print(words_)

['Are you sure moving ahead on this route is the right thing?\n']


### 3. Preprocessing the text

In [30]:
sentence = 'John has been selected for the trial phase this time. Congrats!!'
sentence=sentence.lower()

# defining the positive and negative words explicitly
positive_words=['awesome','good', 'nice', 'super', 'fun', 'delightful','congrats']
negative_words=['awful','lame','horrible','bad']

In [31]:
sentence=sentence.replace('!','')
sentence

'john has been selected for the trial phase this time. congrats'

In [35]:
words= sentence.split(' ')
print(words)

['john', 'has', 'been', 'selected', 'for', 'the', 'trial', 'phase', 'this', 'time.', 'congrats']


In [36]:
result= set(words)-set(positive_words)
print(result)

{'has', 'phase', 'for', 'time.', 'trial', 'been', 'john', 'the', 'this', 'selected'}


### 4. Accessing Text from Web

In [28]:
# Make sure both the packages are installed
import urllib3
from bs4 import BeautifulSoup

pool_object = urllib3.PoolManager()
target_url = 'http://www.gutenberg.org/files/2554/2554-h/2554-h.htm#link2HCH0008'
response_ = pool_object.request('GET', target_url)
final_html_txt = BeautifulSoup(response_.data)



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


In [37]:
print(final_html_txt)

<?xml version="1.0" encoding="utf-8"?><!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html lang="en" xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>
      Crime and Punishment, by Fyodor Dostoevsky
    </title>
<style type="text/css" xml:space="preserve">

    body { margin:5%; background:#faebd0; text-align:justify}
    P { text-indent: 1em; margin-top: .25em; margin-bottom: .25em; }
    H1,H2,H3,H4,H5,H6 { text-align: center; margin-left: 15%; margin-right: 15%; }
    hr  { width: 50%; text-align: center;}
    .foot { margin-left: 20%; margin-right: 20%; text-align: justify; text-indent: -3em; font-size: 90%; }
    blockquote {font-size: 97%; font-style: italic; margin-left: 10%; margin-right: 10%;}
    .mynote    {background-color: #DDE; color: #000; padding: .5em; margin-left: 10%; margin-right: 10%; font-family: sans-serif; font-size: 95%;}
    .toc       { margin-left: 10%; margin-bottom: .75em;}
    

### 5. Removal of Stopwords

In [15]:
import nltk
from nltk import word_tokenize

sentence= "This book is about Deep Learning and Natural Language Processing!"
tokens = word_tokenize(sentence)
print(tokens)

['This', 'book', 'is', 'about', 'Deep', 'Learning', 'and', 'Natural', 'Language', 'Processing', '!']


In [19]:
# import nltk
# nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
new_tokens = [w for w in tokens if not w in stop_words]
new_tokens

['This', 'book', 'Deep', 'Learning', 'Natural', 'Language', 'Processing', '!']

### 6. Counter Vectorization

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
texts=["Ramiess sings classic songs","he listen to old pop ","and rock music", ' and also listens classical songs']
cv = CountVectorizer()
cv_fit=cv.fit_transform(texts)

print(cv.get_feature_names())
print(cv_fit.toarray())

['also', 'and', 'classic', 'classical', 'he', 'listen', 'listens', 'music', 'old', 'pop', 'ramiess', 'rock', 'sings', 'songs', 'to']
[[0 0 1 0 0 0 0 0 0 0 1 0 1 1 0]
 [0 0 0 0 1 1 0 0 1 1 0 0 0 0 1]
 [0 1 0 0 0 0 0 1 0 0 0 1 0 0 0]
 [1 1 0 1 0 0 1 0 0 0 0 0 0 1 0]]


### 7. TF-IDF Score

In [38]:
from sklearn.feature_extraction.text import TfidfVectorizer
texts=["Ramiess sings classic songs","he listen to old pop ","and rock music", ' and also listens classical songs']
vect = TfidfVectorizer()
X = vect.fit_transform(texts)

print(X.todense())

[[ 0.          0.          0.52547275  0.          0.          0.          0.
   0.          0.          0.          0.52547275  0.          0.52547275
   0.41428875  0.        ]
 [ 0.          0.          0.          0.          0.4472136   0.4472136
   0.          0.          0.4472136   0.4472136   0.          0.          0.
   0.          0.4472136 ]
 [ 0.          0.48693426  0.          0.          0.          0.          0.
   0.61761437  0.          0.          0.          0.61761437  0.          0.
   0.        ]
 [ 0.48546061  0.38274272  0.          0.48546061  0.          0.
   0.48546061  0.          0.          0.          0.          0.          0.
   0.38274272  0.        ]]


### 8. Text Classifier

In [8]:
from textblob import TextBlob
from textblob.classifiers import NaiveBayesClassifier
data = [
 ('I love my country.', 'pos'),
 ('This is an amazing place!', 'pos'),
 ('I do not like smell of this place.', 'neg'),
 ('I do not like this restaurant', 'neg'),
 ('I am tired of hearing your nonsense.', 'neg'),
 ("I always aspire to be like him", 'pos'),
 ("It's a horrible performance.", "neg")
 ]
model = NaiveBayesClassifier(data)

model.classify("It's an awesome place!")

'pos'