In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [3]:
data = pd.read_csv('../artifacts/sentiment_analysis.csv')

In [4]:
data

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,0,Finally a transparant silicon case ^^ Thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...
3,4,0,I'm wired I know I'm George I was made that wa...
4,5,1,What amazing service! Apple won't even talk to...
...,...,...,...
7915,7916,0,Live out loud #lol #liveoutloud #selfie #smile...
7916,7917,0,We would like to wish you an amazing day! Make...
7917,7918,0,Helping my lovely 90 year old neighbor with he...
7918,7919,0,Finally got my #smart #pocket #wifi stay conne...


In [5]:
data.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,0,Finally a transparant silicon case ^^ Thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...
3,4,0,I'm wired I know I'm George I was made that wa...
4,5,1,What amazing service! Apple won't even talk to...


In [6]:
# data preprossesing
data.shape

(7920, 3)

In [7]:
#check duplicates
data.duplicated().sum()

0

In [8]:
# check for null 
data.isnull().sum()

id       0
label    0
tweet    0
dtype: int64

In [9]:
# =============================Text preprocessing========================
#1 convert ussercase to lower case
#2 remove links
#3 remove Punctuations !?...
#4 remove numbers
#5remove stopwords (and or but)
#6 remove stemming(creating,created,creates==> create) like this take base word

In [10]:
# regular expression
import re
import string

In [11]:
# convert to lover case
data['tweet'] = data['tweet'].apply(lambda x: " ".join(x.lower() for x in x.split()))

In [12]:
data['tweet'].head(5)

0    #fingerprint #pregnancy test https://goo.gl/h1...
1    finally a transparant silicon case ^^ thanks t...
2    we love this! would you go? #talk #makememorie...
3    i'm wired i know i'm george i was made that wa...
4    what amazing service! apple won't even talk to...
Name: tweet, dtype: object

# remove links


In [13]:
data["tweet"] = data['tweet'].apply(lambda x: " ".join(re.sub(r'^https?:\/\/.*[\r\n]*', '', x, flags=re.MULTILINE) for x in x.split()))

In [14]:
data['tweet'].head()

0    #fingerprint #pregnancy test  #android #apps #...
1    finally a transparant silicon case ^^ thanks t...
2    we love this! would you go? #talk #makememorie...
3    i'm wired i know i'm george i was made that wa...
4    what amazing service! apple won't even talk to...
Name: tweet, dtype: object

In [47]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

# remove puncuation

In [51]:
def remove_punctuation(text):
    for punctuation in string.punctuation:
        text = text.replace(punctuation,' ')
    return text
data["tweet"] = data["tweet"].apply(remove_punctuation)

In [52]:
data['tweet'].head(5)

0     fingerprint  pregnanc test  android  app  bea...
1    final transpar silicon case    thank uncl     ...
2    love would go   talk  makememori  unplug  rela...
3    i m wire know i m georg made way     iphon  cu...
4    amaz servic appl even talk question unless pay...
Name: tweet, dtype: object

In [53]:
data['tweet'].tail()

7915    live loud  lol  liveoutloud  selfi  smile  son...
7916    would like wish amaz day make everi minut coun...
7917    help love year old neighbor ipad morn made rea...
7918    final got  smart  pocket  wifi stay connect an...
7919    appl barcelona  appl  store  bcn  barcelona  t...
Name: tweet, dtype: object

# By adding the r before the string, you tell Python to treat the string as a raw string, # which will correctly interpret the regex pattern.
#  \d+ is interpreted as a regex pattern matching one or more digits.
# removing numbers


In [54]:
data['tweet'] = data['tweet'].str.replace(r'\d+','',regex=True)

In [55]:
data['tweet'].tail()

7915    live loud  lol  liveoutloud  selfi  smile  son...
7916    would like wish amaz day make everi minut coun...
7917    help love year old neighbor ipad morn made rea...
7918    final got  smart  pocket  wifi stay connect an...
7919    appl barcelona  appl  store  bcn  barcelona  t...
Name: tweet, dtype: object

# remove stop words
# we can take those stop words from NLTK library


In [23]:
!pip install nltk
import nltk



In [27]:
# here we are downoding all the stop words for all langauges here we need to choose only englsih for our case
nltk.download('stopwords',download_dir='../static/model')

[nltk_data] Downloading package stopwords to ../static/model...
[nltk_data]   Package stopwords is already up-to-date!


True

In [56]:
with open('../static/model/corpora/stopwords/english','r') as file:
    sw = file.read().splitlines()

In [29]:
sw

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [58]:
# extracting wihout stop words
data['tweet'] = data['tweet'].apply(lambda x: " ".join(x for x in x.split() if x not in sw))

In [57]:
data['tweet']

0        fingerprint  pregnanc test  android  app  bea...
1       final transpar silicon case    thank uncl     ...
2       love would go   talk  makememori  unplug  rela...
3       i m wire know i m georg made way     iphon  cu...
4       amaz servic appl even talk question unless pay...
                              ...                        
7915    live loud  lol  liveoutloud  selfi  smile  son...
7916    would like wish amaz day make everi minut coun...
7917    help love year old neighbor ipad morn made rea...
7918    final got  smart  pocket  wifi stay connect an...
7919    appl barcelona  appl  store  bcn  barcelona  t...
Name: tweet, Length: 7920, dtype: object

# stemming
# Getting the base word 
# eg: if there is a word 'finally' instead of finally getting final from it
# We can do this from nltk

In [59]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()

In [60]:
data['tweet'] = data['tweet'].apply(lambda x: " ".join(ps.stem(x) for x in x.split()))
# here wea are putting all words of tweet column tbrough stem() menthod and stem methou gives base word

In [61]:
data['tweet'].head()

0    fingerprint pregnanc test android app beauti c...
1    final transpar silicon case thank uncl yay son...
2    love would go talk makememori unplug relax iph...
3    wire know georg made way iphon cute daventri home
4    amaz servic appl even talk question unless pay...
Name: tweet, dtype: object

# We need to turn those into numerical values because of ML model unable to process text values
# We need to do vectorization for text preprocessed words
