In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("dataset.csv")

In [3]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


## Lowercasing

In [4]:
df['review'][3]

"Basically there's a family where a little boy (Jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />This movie is slower than a soap opera... and suddenly, Jake decides to become Rambo and kill the zombie.<br /><br />OK, first of all when you're going to make a film you must Decide if its a thriller or a drama! As a drama the movie is watchable. Parents are divorcing & arguing like in real life. And then we have Jake with his closet which totally ruins all the film! I expected to see a BOOGEYMAN similar movie, and instead i watched a drama with some meaningless thriller spots.<br /><br />3 out of 10 just for the well playing parents & descent dialogs. As for the shots with Jake: just ignore them."

In [5]:
df['review'][3].lower()

"basically there's a family where a little boy (jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />this movie is slower than a soap opera... and suddenly, jake decides to become rambo and kill the zombie.<br /><br />ok, first of all when you're going to make a film you must decide if its a thriller or a drama! as a drama the movie is watchable. parents are divorcing & arguing like in real life. and then we have jake with his closet which totally ruins all the film! i expected to see a boogeyman similar movie, and instead i watched a drama with some meaningless thriller spots.<br /><br />3 out of 10 just for the well playing parents & descent dialogs. as for the shots with jake: just ignore them."

In [6]:
df['review'] = df['review'].str.lower()
df['review']

0        one of the other reviewers has mentioned that ...
1        a wonderful little production. <br /><br />the...
2        i thought this was a wonderful way to spend ti...
3        basically there's a family where a little boy ...
4        petter mattei's "love in the time of money" is...
                               ...                        
49995    i thought this movie did a down right good job...
49996    bad plot, bad dialogue, bad acting, idiotic di...
49997    i am a catholic taught in parochial elementary...
49998    i'm going to have to disagree with the previou...
49999    no one expects the star trek movies to be high...
Name: review, Length: 50000, dtype: object

## Remove HTML Tags

In [7]:
import re

In [8]:
def remove_html_tags(text):
    pattern = re.compile('<.*?>')
    return pattern.sub(r'', text)

In [9]:
text = "<html><head><title>Sample HTML Document</title></head><body><h1>Welcome to My Website</h1><p>This is a sample HTML document with <strong>bold</strong> and <em>italic</em> text.</p><ul><li>Item 1</li><li>Item 2</li><li>Item 3</li></ul></body></html>"

In [10]:
remove_html_tags(text)

'Sample HTML DocumentWelcome to My WebsiteThis is a sample HTML document with bold and italic text.Item 1Item 2Item 3'

In [11]:
df['review'] = df['review'].apply(remove_html_tags)
df['review']

0        one of the other reviewers has mentioned that ...
1        a wonderful little production. the filming tec...
2        i thought this was a wonderful way to spend ti...
3        basically there's a family where a little boy ...
4        petter mattei's "love in the time of money" is...
                               ...                        
49995    i thought this movie did a down right good job...
49996    bad plot, bad dialogue, bad acting, idiotic di...
49997    i am a catholic taught in parochial elementary...
49998    i'm going to have to disagree with the previou...
49999    no one expects the star trek movies to be high...
Name: review, Length: 50000, dtype: object

## Remove URLs

In [12]:
def remove_url(text):
    pattern = re.compile(r'https?://\S+|www\.\S+')
    return pattern.sub(r' ', text)

In [13]:
text1 = "First Link https://www.example.com"
text2 = "Second Link https://www.samplewebsite.org"
text3 = "Third Link https://blog.example.net/article123"
text4 = "Fourth Link https://www.testsite.com/page?query=example"

In [14]:
remove_url(text1)

'First Link  '

In [15]:
remove_url(text3)

'Third Link  '

## Removing Punctutations

In [16]:
import string, time

In [17]:
exclude = string.punctuation

In [18]:
def remove_punc(text):
    for char in exclude:
        text = text.replace(char, '')
    return text

In [19]:
text = "JKdu-dsjkddagf?suyge&^sdj1234rjksfu89%$"
remove_punc(text)

'JKdudsjkddagfsuygesdj1234rjksfu89'

In [20]:
# this code takes a lot of time to process

start = time.time()
print(remove_punc(text))
time1 = time.time() - start
print(time1)

JKdudsjkddagfsuygesdj1234rjksfu89
0.0


In [21]:
# preferred code for removal of punctuations
# faster than remove_punc

def remove_punc1(text):
    return text.translate(str.maketrans('', '', exclude))

In [22]:
start = time.time()
print(remove_punc1(text))
time2 = time.time() - start
print(time2)

JKdudsjkddagfsuygesdj1234rjksfu89
0.0


## Chat Word Treatment

In [23]:
import csv
import re

def chat_word_treatment(user_string):
    user_string = user_string.split(" ")
    j = 0
    for _str in user_string:
        fileName = "slang.txt"
        accessMode = "r"
        with open(fileName, accessMode) as myCSVfile:
            dataFromFile = csv.reader(myCSVfile, delimiter="=")
            _str = re.sub('[^a-zA-Z0-9-_.]', '', _str)
            for row in dataFromFile:
                if _str.upper() == row[0]:
                    user_string[j] = row[1]
            myCSVfile.close()
        j = j + 1
    print(' '.join(user_string))

In [24]:
text = 'I will fyi'
chat_word_treatment(text)

I will For Your Information


## Spelling Correction

from textblob import TextBlob

text = 'ceertain conditionas duriing seveal'
txtBlb = TextBlob(text)
txtBlb.correct().string

## Removing Stop Words

In [25]:
from nltk.corpus import stopwords



In [26]:
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [27]:
stopwords.words('spanish')

['de',
 'la',
 'que',
 'el',
 'en',
 'y',
 'a',
 'los',
 'del',
 'se',
 'las',
 'por',
 'un',
 'para',
 'con',
 'no',
 'una',
 'su',
 'al',
 'lo',
 'como',
 'más',
 'pero',
 'sus',
 'le',
 'ya',
 'o',
 'este',
 'sí',
 'porque',
 'esta',
 'entre',
 'cuando',
 'muy',
 'sin',
 'sobre',
 'también',
 'me',
 'hasta',
 'hay',
 'donde',
 'quien',
 'desde',
 'todo',
 'nos',
 'durante',
 'todos',
 'uno',
 'les',
 'ni',
 'contra',
 'otros',
 'ese',
 'eso',
 'ante',
 'ellos',
 'e',
 'esto',
 'mí',
 'antes',
 'algunos',
 'qué',
 'unos',
 'yo',
 'otro',
 'otras',
 'otra',
 'él',
 'tanto',
 'esa',
 'estos',
 'mucho',
 'quienes',
 'nada',
 'muchos',
 'cual',
 'poco',
 'ella',
 'estar',
 'estas',
 'algunas',
 'algo',
 'nosotros',
 'mi',
 'mis',
 'tú',
 'te',
 'ti',
 'tu',
 'tus',
 'ellas',
 'nosotras',
 'vosotros',
 'vosotras',
 'os',
 'mío',
 'mía',
 'míos',
 'mías',
 'tuyo',
 'tuya',
 'tuyos',
 'tuyas',
 'suyo',
 'suya',
 'suyos',
 'suyas',
 'nuestro',
 'nuestra',
 'nuestros',
 'nuestras',
 'vuestro'

In [28]:
def remove_stopwords(text):
    new_text = []
    
    for word in text.split():
        if word in stopwords.words('english'):
            new_text.append('')
        else:
            new_text.append(word)
    
    x = new_text[:]
    new_text.clear()
    return " ".join(x)

## Tokenization

### 1. Using split Function

In [29]:
# word tokenization
sent1 = 'I am going to Bhopal'
sent1.split()

['I', 'am', 'going', 'to', 'Bhopal']

In [30]:
#sentence tokenization
sent2 = 'I am going to Bhopal. It is a beautiful city.'
sent2.split('.')

['I am going to Bhopal', ' It is a beautiful city', '']

In [31]:
# problem with split function - 1
sent3 = 'I am going to Bhopal!'
sent3.split()

['I', 'am', 'going', 'to', 'Bhopal!']

In [32]:
# problem with split function - 2
sent4 = 'I am going to Bhopal! It is going to be a great trip!'
sent4.split('.')

['I am going to Bhopal! It is going to be a great trip!']

### 2. Regular Expression

In [33]:
import re
sent5 = 'I am going to be in Bhopal!'
tokens = re.findall("[\w]+", sent5)
tokens

['I', 'am', 'going', 'to', 'be', 'in', 'Bhopal']

In [34]:
text = """Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum."""
sentences = re.compile('[.!?] ').split(text)
sentences

['Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua',
 'Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat',
 'Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur',
 'Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.']

### 3. NLTK

In [35]:
from nltk.tokenize import word_tokenize, sent_tokenize

In [36]:
# sentence tokenization
text = """Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum."""
sent_tokenize(text)

['Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.',
 'Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.',
 'Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.',
 'Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.']

In [37]:
# word tokenize
text = "I am going to Bhopal! My mail ID is sk.daui@gmail.com"
word_tokenize(text)

['I',
 'am',
 'going',
 'to',
 'Bhopal',
 '!',
 'My',
 'mail',
 'ID',
 'is',
 'sk.daui',
 '@',
 'gmail.com']

### 4. Spacy

In [40]:
import spacy

ModuleNotFoundError: No module named 'en_core_web_sm'

In [43]:
# nlp = spacy.load("en_core_web_sm")

In [42]:
sent = "I've a Ph.D in A.I."
# doc = nlp(sent)

In [None]:
# for token in doc:
#     print(token)

## Stemming

In [44]:
from nltk.stem.porter import PorterStemmer

In [45]:
ps = PorterStemmer()
def stem_words(text):
    return ' '.join([ps.stem(word) for word in text.split()])

In [47]:
sample = 'walk walks walking walked'
stem_words(sample)

'walk walk walk walk'

In [48]:
text = "Football, also known as soccer in some regions, stands as the world's most popular and widely followed sport, captivating the hearts of millions globally. Played on vast green fields, the game is a dynamic display of skill, strategy, and teamwork. Teams of eleven players each compete to score goals by propelling a ball into the opponent's net, using any part of their bodies except their hands. Beyond the exhilarating matches, football fosters a sense of community and passion, creating a shared language that transcends cultural and geographical boundaries. With iconic tournaments like the FIFA World Cup uniting nations in celebration, football goes beyond being just a sport; it becomes a cultural phenomenon, weaving stories of triumph, rivalry, and camaraderie on a global stage."
print(text)

Football, also known as soccer in some regions, stands as the world's most popular and widely followed sport, captivating the hearts of millions globally. Played on vast green fields, the game is a dynamic display of skill, strategy, and teamwork. Teams of eleven players each compete to score goals by propelling a ball into the opponent's net, using any part of their bodies except their hands. Beyond the exhilarating matches, football fosters a sense of community and passion, creating a shared language that transcends cultural and geographical boundaries. With iconic tournaments like the FIFA World Cup uniting nations in celebration, football goes beyond being just a sport; it becomes a cultural phenomenon, weaving stories of triumph, rivalry, and camaraderie on a global stage.


In [49]:
stem_words(text)

"football, also known as soccer in some regions, stand as the world' most popular and wide follow sport, captiv the heart of million globally. play on vast green fields, the game is a dynam display of skill, strategy, and teamwork. team of eleven player each compet to score goal by propel a ball into the opponent' net, use ani part of their bodi except their hands. beyond the exhilar matches, footbal foster a sens of commun and passion, creat a share languag that transcend cultur and geograph boundaries. with icon tournament like the fifa world cup unit nation in celebration, footbal goe beyond be just a sport; it becom a cultur phenomenon, weav stori of triumph, rivalry, and camaraderi on a global stage."

## Lemmatization

In [52]:
import nltk
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

In [54]:
sentence = "He was running and eating at the same time. He has bad habit of swimming after playing long hours in the Sun."
punctuations = '?:!.,:'
sentence_words = nltk.word_tokenize(sentence)
for word in sentence_words:
    if word in punctuations:
        sentence_words.remove(word)
        
sentence_words
print("{0:20}{1:20}".format('Word', "Lemma"))
for word in sentence_words:
    print("{0:20}{1:20}".format(word, wordnet_lemmatizer.lemmatize(word)))

Word                Lemma               
He                  He                  
was                 wa                  
running             running             
and                 and                 
eating              eating              
at                  at                  
the                 the                 
same                same                
time                time                
He                  He                  
has                 ha                  
bad                 bad                 
habit               habit               
of                  of                  
swimming            swimming            
after               after               
playing             playing             
long                long                
hours               hour                
in                  in                  
the                 the                 
Sun                 Sun                 
