Dataset: https://www.kaggle.com/datasets/kazanova/sentiment140

In [1]:
# pip install -U spacy
# pip install -U spacy-lookups-data
# python -m spacy download en_core_web_sm
# python -m spacy download en_core_web_md
# python -m spacy download en_core_web_lg

### General Feature Extraction
   - File loading
   - Word counts
   - Characters count
   - Average characters per word
   - Stop words count
   - Count #Hash Tags and @Mentions
   - if Numeric digits are present in twits
   - Upper case word counts



###  Preprocessing and Cleaning
   -  Lower case
   -  Contraction to Expansion
   -  Email removal and counts
   -  URLs removal and counts
   -  Removal of RT
   -  Removal of Special Characters
   -  Removal of multiple spaces
   -  Removal of HTML tags
   -  Removal of accented characters
   -  Removal of stop Words
   -  Conversion into base form of words
   -  Common Occurring words Removal
   -  Rare Occurring words Removal
   -  Word Cloud
   -  Spelling Correction
   -  Tokenization 
   -  Lemmatization
   -  Detecting Entities using NER
   -  Noun Detection
   -  Sentence Translation
   -  Using inbuilt `Sentiment Classifier`

### Advanced Text Processing and Feature Extraction
   - N-Gram, Bi-Gram etc
   - Bag of Words (BoW)
   - Term Frequency Calculation <mark>TF</mark>
   - Inverse Document Frequency <mark>IDF</mark>
   - `TF-IDF` Term Frequency-Inverse Document Frequency    
   - Word Embedding `Word2Vec` using Spacy

### Machine Learning Models for Text Classification
   - SGDClassifier
   - LogisticRegression
   - LogisticRegressionCV
   - LinearSVC
   - RandomForestClassifier

In [2]:
import numpy as np
import pandas as pd

In [3]:
pd.set_option('display.max_colwidth', None)

In [4]:
df = pd.read_csv('twitter16m.csv', encoding='latin1', header=None )

In [5]:
df.head()

Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah!
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Managed to save 50% The rest go out of bounds
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there."


In [6]:
df = df[[5,0]]
df.columns = ['tweets', 'sentiment']

In [7]:
df.head()

Unnamed: 0,tweets,sentiment
0,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D",0
1,is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah!,0
2,@Kenichan I dived many times for the ball. Managed to save 50% The rest go out of bounds,0
3,my whole body feels itchy and like its on fire,0
4,"@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there.",0


In [8]:
# count unique value of a column
df['sentiment'].value_counts()

0    800000
4    800000
Name: sentiment, dtype: int64

In [9]:
sent_map = {0: 'negative', 4:'positive'}

##### Word counts

In [10]:
# lambda function take input row by row in 'x' variable.
# some value may have numeric value that's why we use 'str()'
df['word_counts'] = df['tweets'].apply(lambda x: len(str(x).split())) 

In [11]:
df.head()

Unnamed: 0,tweets,sentiment,word_counts
0,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D",0,19
1,is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah!,0,21
2,@Kenichan I dived many times for the ball. Managed to save 50% The rest go out of bounds,0,18
3,my whole body feels itchy and like its on fire,0,10
4,"@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there.",0,21


##### Characters count

In [12]:
df['char_counts'] = df['tweets'].apply(lambda x: len(x))

In [13]:
df.head()

Unnamed: 0,tweets,sentiment,word_counts,char_counts
0,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D",0,19,115
1,is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah!,0,21,111
2,@Kenichan I dived many times for the ball. Managed to save 50% The rest go out of bounds,0,18,89
3,my whole body feels itchy and like its on fire,0,10,47
4,"@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there.",0,21,111


##### Average characters per word ( average word length)

In [14]:
def get_average_word_len(x):
    words = x.split()
    word_len = 0
    for word in words:
        word_len += len(word)
    return word_len/len(words) # != len(x)/len(words)

In [15]:
df['avg_word_len'] = df['tweets'].apply(lambda x: get_average_word_len(x))

In [16]:
df.head()

Unnamed: 0,tweets,sentiment,word_counts,char_counts,avg_word_len
0,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D",0,19,115,5.052632
1,is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah!,0,21,111,4.285714
2,@Kenichan I dived many times for the ball. Managed to save 50% The rest go out of bounds,0,18,89,3.944444
3,my whole body feels itchy and like its on fire,0,10,47,3.7
4,"@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there.",0,21,111,4.285714


##### Stop words count

In [17]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

2022-09-28 12:16:04.493173: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1
2022-09-28 12:16:07.749946: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2022-09-28 12:16:07.750151: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-09-28 12:16:07.750160: W tensorflow/stream_executor/cuda/cuda_driver.cc:326] failed call to cuInit: UNKNOWN ERROR (303)
2022-09-28 12:16:07.750175: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (arafat-pc): /proc/driver/nvidia/version does not exist


In [18]:
# for finding the semantic meaning of a sentense, stopword is important.
# before delete stop word, think what you want to do.
print(STOP_WORDS)

{'has', 'most', 'noone', 'becoming', 'to', 'it', 'will', 'her', 'no', 'been', 'then', 'back', 'unless', 'though', 'say', 'over', 'mostly', 'on', 'via', 'sixty', 'ourselves', 'nine', 'as', 'ca', 'once', 'nor', 'down', 'another', 'do', 'more', 'by', 'thereafter', 'used', 'yourselves', 'together', 'and', 'could', 'please', 'i', 'our', 'beside', 'namely', 'get', '‘s', 'whatever', 'those', 'ever', 'several', 'who', 'much', 'other', 'are', 'already', "'s", 'hundred', 'therein', 'many', 'your', 'using', 'might', 'therefore', 'done', 'after', 'well', 'about', 'very', 'show', 'various', '’m', 'everywhere', 'ten', 'their', 'because', 'forty', 'one', 'n’t', 'last', 'neither', 'whose', 'he', 'made', 'twenty', 'across', 'perhaps', 'themselves', 'hers', 'would', 'although', 'toward', 'anything', 'front', 'from', 'n‘t', 'either', 'have', 'an', 'there', 'again', 'through', 'such', 'why', 'two', 'whenever', 'here', "'m", 'make', 'amongst', 'eight', 'nowhere', 'four', 'before', '’d', 'somehow', 'within'

In [19]:
# example of creating a list that's contain stopwords and find the len of that list.
x = 'my name is arafat'
[word for word in x.split() if word in STOP_WORDS]

['my', 'name', 'is']

In [20]:
len([word for word in x.split() if word in STOP_WORDS])

3

In [21]:
df['stop_word_len'] =  df['tweets'].apply(lambda x: len([word for word in x.split() if word in STOP_WORDS]))

In [22]:
df.head()

Unnamed: 0,tweets,sentiment,word_counts,char_counts,avg_word_len,stop_word_len
0,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D",0,19,115,5.052632,4
1,is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah!,0,21,111,4.285714,9
2,@Kenichan I dived many times for the ball. Managed to save 50% The rest go out of bounds,0,18,89,3.944444,7
3,my whole body feels itchy and like its on fire,0,10,47,3.7,5
4,"@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there.",0,21,111,4.285714,10


##### Count #Hash Tags and @Mentions

In [23]:
# EX:
x = 'this is #hashtag and this is @mention'
x.split()

['this', 'is', '#hashtag', 'and', 'this', 'is', '@mention']

In [24]:
# string.startswith(value, start, end)
li = [word for word in x.split() if word.startswith('#')]
li

['#hashtag']

In [25]:
li = [word for word in x.split() if word.startswith(('#','@'))]
li

['#hashtag', '@mention']

In [26]:
li = [word for word in x.split() if word.startswith(('#','@'),0,3)]
li

['#hashtag', '@mention']

In [27]:
df['#_counts'] = df['tweets'].apply(lambda x: len([word for word in x.split() if word.startswith('#')]))
df['@_counts'] = df['tweets'].apply(lambda x: len([word for word in x.split() if word.startswith('@')]))

In [28]:
df.head()
#30 min

Unnamed: 0,tweets,sentiment,word_counts,char_counts,avg_word_len,stop_word_len,#_counts,@_counts
0,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D",0,19,115,5.052632,4,0,1
1,is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah!,0,21,111,4.285714,9,0,0
2,@Kenichan I dived many times for the ball. Managed to save 50% The rest go out of bounds,0,18,89,3.944444,7,0,1
3,my whole body feels itchy and like its on fire,0,10,47,3.7,5,0,0
4,"@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there.",0,21,111,4.285714,10,0,1


##### if Numeric digits are present in twits

In [29]:
df['numeric_counts'] = df['tweets'].apply(lambda x: len([word for word in x.split() if word.isdigit()]))

In [30]:
df.head()

Unnamed: 0,tweets,sentiment,word_counts,char_counts,avg_word_len,stop_word_len,#_counts,@_counts,numeric_counts
0,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D",0,19,115,5.052632,4,0,1,0
1,is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah!,0,21,111,4.285714,9,0,0,0
2,@Kenichan I dived many times for the ball. Managed to save 50% The rest go out of bounds,0,18,89,3.944444,7,0,1,0
3,my whole body feels itchy and like its on fire,0,10,47,3.7,5,0,0,0
4,"@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there.",0,21,111,4.285714,10,0,1,0


##### Upper case word counts

In [31]:
df['upper_counts'] = df['tweets'].apply(lambda x: len([word for word in x.split() if word.isupper() and len(x)>3 ]))

In [32]:
df.head()

Unnamed: 0,tweets,sentiment,word_counts,char_counts,avg_word_len,stop_word_len,#_counts,@_counts,numeric_counts,upper_counts
0,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D",0,19,115,5.052632,4,0,1,0,1
1,is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah!,0,21,111,4.285714,9,0,0,0,0
2,@Kenichan I dived many times for the ball. Managed to save 50% The rest go out of bounds,0,18,89,3.944444,7,0,1,0,1
3,my whole body feels itchy and like its on fire,0,10,47,3.7,5,0,0,0,0
4,"@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there.",0,21,111,4.285714,10,0,1,0,1


In [33]:
df.loc[96]['tweets']

"so rylee,grace...wana go steve's party or not?? SADLY SINCE ITS EASTER I WNT B ABLE 2 DO MUCH  BUT OHH WELL....."

In [34]:
df.loc[96]

tweets            so rylee,grace...wana go steve's party or not?? SADLY SINCE ITS EASTER I WNT B ABLE 2 DO MUCH  BUT OHH WELL.....
sentiment                                                                                                                        0
word_counts                                                                                                                     21
char_counts                                                                                                                    112
avg_word_len                                                                                                              4.333333
stop_word_len                                                                                                                    3
#_counts                                                                                                                         0
@_counts                                                                           

# Preprocessing and Cleaning


##### Lower case

In [35]:
df['tweets'] = df['tweets'].apply(lambda x: x.lower())

In [36]:
df.head()

Unnamed: 0,tweets,sentiment,word_counts,char_counts,avg_word_len,stop_word_len,#_counts,@_counts,numeric_counts,upper_counts
0,"@switchfoot http://twitpic.com/2y1zl - awww, that's a bummer. you shoulda got david carr of third day to do it. ;d",0,19,115,5.052632,4,0,1,0,1
1,is upset that he can't update his facebook by texting it... and might cry as a result school today also. blah!,0,21,111,4.285714,9,0,0,0,0
2,@kenichan i dived many times for the ball. managed to save 50% the rest go out of bounds,0,18,89,3.944444,7,0,1,0,1
3,my whole body feels itchy and like its on fire,0,10,47,3.7,5,0,0,0,0
4,"@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because i can't see you all over there.",0,21,111,4.285714,10,0,1,0,1


##### Contraction to Expansion

In [37]:
# don' = do not
x = "i don't know what you want, can't, he'll, i'd"
# 40 min

##### Email removal and counts

In [38]:
import re

In [39]:
x = 'hi, email me at arafat6462@gmail.com, you can also find me on 18-37576-1@student.aiub.edu'

In [40]:
re.findall(r'([a-zA-Z0-9+._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+)',x)

['arafat6462@gmail.com', '18-37576-1@student.aiub.edu']

In [41]:
%%time
df['email'] = df['tweets'].apply(lambda x: re.findall(r'([a-zA-Z0-9+._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+)',x))

CPU times: user 4.98 s, sys: 43.7 ms, total: 5.03 s
Wall time: 5.03 s


In [42]:
df.head()

Unnamed: 0,tweets,sentiment,word_counts,char_counts,avg_word_len,stop_word_len,#_counts,@_counts,numeric_counts,upper_counts,email
0,"@switchfoot http://twitpic.com/2y1zl - awww, that's a bummer. you shoulda got david carr of third day to do it. ;d",0,19,115,5.052632,4,0,1,0,1,[]
1,is upset that he can't update his facebook by texting it... and might cry as a result school today also. blah!,0,21,111,4.285714,9,0,0,0,0,[]
2,@kenichan i dived many times for the ball. managed to save 50% the rest go out of bounds,0,18,89,3.944444,7,0,1,0,1,[]
3,my whole body feels itchy and like its on fire,0,10,47,3.7,5,0,0,0,0,[]
4,"@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because i can't see you all over there.",0,21,111,4.285714,10,0,1,0,1,[]


In [43]:
df['email_counts'] = df['email'].apply(lambda x: len(x))

In [44]:
df.head()

Unnamed: 0,tweets,sentiment,word_counts,char_counts,avg_word_len,stop_word_len,#_counts,@_counts,numeric_counts,upper_counts,email,email_counts
0,"@switchfoot http://twitpic.com/2y1zl - awww, that's a bummer. you shoulda got david carr of third day to do it. ;d",0,19,115,5.052632,4,0,1,0,1,[],0
1,is upset that he can't update his facebook by texting it... and might cry as a result school today also. blah!,0,21,111,4.285714,9,0,0,0,0,[],0
2,@kenichan i dived many times for the ball. managed to save 50% the rest go out of bounds,0,18,89,3.944444,7,0,1,0,1,[],0
3,my whole body feels itchy and like its on fire,0,10,47,3.7,5,0,0,0,0,[],0
4,"@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because i can't see you all over there.",0,21,111,4.285714,10,0,1,0,1,[],0


In [45]:
# find which tweets has mail
df[df['email_counts']>0].head()

Unnamed: 0,tweets,sentiment,word_counts,char_counts,avg_word_len,stop_word_len,#_counts,@_counts,numeric_counts,upper_counts,email,email_counts
4054,i want a new laptop. hp tx2000 is the bomb. :| who knows how much it is? im me: gabbehhramos@yahoo.com,0,20,103,4.15,6,0,0,0,4,[gabbehhramos@yahoo.com],1
7917,who stole elledell@gmail.com?,0,3,31,9.0,1,0,0,0,0,[elledell@gmail.com],1
8496,@alexistehpom really? did you send out all the info already? if you did..maybe you could just email me stuff missataari@gmail.com,0,20,130,5.5,11,0,1,0,0,[missataari@gmail.com],1
10290,@laureystack awh...that's kinda sad lol add me?? hello.kitty.65@hotmail.com,0,8,76,8.5,0,0,1,0,0,[hello.kitty.65@hotmail.com],1
16413,"@jilliancyork got 2 bottom of it, human error bug from a release last month, being fixed tonight. email press@linkedin.com for details",0,21,137,5.428571,7,0,1,1,0,[press@linkedin.com],1


In [46]:
df.loc[4054] 

tweets            i want a new laptop.  hp tx2000 is the bomb. :| who knows how much it is? im me: gabbehhramos@yahoo.com
sentiment                                                                                                               0
word_counts                                                                                                            20
char_counts                                                                                                           103
avg_word_len                                                                                                         4.15
stop_word_len                                                                                                           6
#_counts                                                                                                                0
@_counts                                                                                                                0
numeric_counts          

In [47]:
# remove email
df['tweets'] = df['tweets'].apply(lambda x: re.sub(r'([a-zA-Z0-9+._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+)', '' ,x))

In [48]:
df.loc[4054] 

tweets            i want a new laptop.  hp tx2000 is the bomb. :| who knows how much it is? im me: 
sentiment                                                                                         0
word_counts                                                                                      20
char_counts                                                                                     103
avg_word_len                                                                                   4.15
stop_word_len                                                                                     6
#_counts                                                                                          0
@_counts                                                                                          0
numeric_counts                                                                                    0
upper_counts                                                                                      4


In [49]:
df[df['email_counts']>0].head()

Unnamed: 0,tweets,sentiment,word_counts,char_counts,avg_word_len,stop_word_len,#_counts,@_counts,numeric_counts,upper_counts,email,email_counts
4054,i want a new laptop. hp tx2000 is the bomb. :| who knows how much it is? im me:,0,20,103,4.15,6,0,0,0,4,[gabbehhramos@yahoo.com],1
7917,who stole ?,0,3,31,9.0,1,0,0,0,0,[elledell@gmail.com],1
8496,@alexistehpom really? did you send out all the info already? if you did..maybe you could just email me stuff,0,20,130,5.5,11,0,1,0,0,[missataari@gmail.com],1
10290,@laureystack awh...that's kinda sad lol add me??,0,8,76,8.5,0,0,1,0,0,[hello.kitty.65@hotmail.com],1
16413,"@jilliancyork got 2 bottom of it, human error bug from a release last month, being fixed tonight. email for details",0,21,137,5.428571,7,0,1,1,0,[press@linkedin.com],1


##### URLs removal and counts

In [50]:
# 52 min

In [74]:
x = 'hi, visit https://youtube.com/arafat6462 https://youtube.com'

In [75]:
re.findall(r'(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', x)


[('https', 'youtube.com', '/arafat6462'), ('https', 'youtube.com', '')]

In [72]:
df['url_flag'] = df['tweets'].apply(lambda x: len(re.findall(r'(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', x)))

In [73]:
df.head()

Unnamed: 0,tweets,sentiment,word_counts,char_counts,avg_word_len,stop_word_len,#_counts,@_counts,numeric_counts,upper_counts,email,email_counts,url_flag
0,"@switchfoot http://twitpic.com/2y1zl - awww, that's a bummer. you shoulda got david carr of third day to do it. ;d",0,19,115,5.052632,4,0,1,0,1,[],0,1
1,is upset that he can't update his facebook by texting it... and might cry as a result school today also. blah!,0,21,111,4.285714,9,0,0,0,0,[],0,0
2,@kenichan i dived many times for the ball. managed to save 50% the rest go out of bounds,0,18,89,3.944444,7,0,1,0,1,[],0,0
3,my whole body feels itchy and like its on fire,0,10,47,3.7,5,0,0,0,0,[],0,0
4,"@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because i can't see you all over there.",0,21,111,4.285714,10,0,1,0,1,[],0,0
