In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('/content/twitter_training.csv',names=['id', 'topic', 'sentiment', 'tweet'])
df.head()

Unnamed: 0,id,topic,sentiment,tweet
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [3]:
# Removing id col bcz does not provide meaningful information for EDA

df.drop(columns=['id'],inplace=True)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74682 entries, 0 to 74681
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   topic      74682 non-null  object
 1   sentiment  74682 non-null  object
 2   tweet      73996 non-null  object
dtypes: object(3)
memory usage: 1.7+ MB


In [5]:
df.shape

(74682, 3)

In [6]:
# Null values
df.isnull().sum()

Unnamed: 0,0
topic,0
sentiment,0
tweet,686


In [7]:
# I have 74646 row data and 686 null value i will drop it bcz its in textual form
# 686 is less number of data
(df.isnull().sum()['tweet']/df.shape[0]) * 100

# here it is less than 1 percenatage so worries!


0.9185613668621622

In [8]:
# Drop Null rows

df.dropna(inplace = True)

In [9]:
df.isnull().sum()

Unnamed: 0,0
topic,0
sentiment,0
tweet,0


In [10]:
df.shape


(73996, 3)

# Text Preprocessing
- Lower case
- Removing HTML tags
- Remove Puctuation

In [11]:
# Lower Case
df['topic'] = df['topic'].str.lower()
df['sentiment'] = df['sentiment'].str.lower()
df['tweet'] = df['tweet'].str.lower()

In [12]:
df.head()

Unnamed: 0,topic,sentiment,tweet
0,borderlands,positive,im getting on borderlands and i will murder yo...
1,borderlands,positive,i am coming to the borders and i will kill you...
2,borderlands,positive,im getting on borderlands and i will kill you ...
3,borderlands,positive,im coming on borderlands and i will murder you...
4,borderlands,positive,im getting on borderlands 2 and i will murder ...


In [13]:
# Removing HTML tags
import re

def remove_html_tags(text):
  pattern = re.compile('<.*?>')
  return pattern.sub('',text)

In [14]:
df['tweet'] = df['tweet'].apply(remove_html_tags)

In [15]:
df.head()

Unnamed: 0,topic,sentiment,tweet
0,borderlands,positive,im getting on borderlands and i will murder yo...
1,borderlands,positive,i am coming to the borders and i will kill you...
2,borderlands,positive,im getting on borderlands and i will kill you ...
3,borderlands,positive,im coming on borderlands and i will murder you...
4,borderlands,positive,im getting on borderlands 2 and i will murder ...


In [16]:
# Remove Puctuation

In [17]:
import string

exclude = string.punctuation
exclude = exclude + ('123456789')
def remove_punc(text):
  for char in exclude:
    if char in text:
      text = text.replace(char,'')
  return text


In [18]:
df['tweet'] = df['tweet'].apply(remove_punc)

In [19]:
df.head()

Unnamed: 0,topic,sentiment,tweet
0,borderlands,positive,im getting on borderlands and i will murder yo...
1,borderlands,positive,i am coming to the borders and i will kill you...
2,borderlands,positive,im getting on borderlands and i will kill you all
3,borderlands,positive,im coming on borderlands and i will murder you...
4,borderlands,positive,im getting on borderlands and i will murder y...


In [20]:
# chat word treatment

In [21]:
# I need to write a regular expression for this text and convert this into dictionary

chat_words = """AFAIK=As Far As I Know
AFK=Away From Keyboard
ASAP=As Soon As Possible
ATK=At The Keyboard
ATM=At The Moment
A3=Anytime, Anywhere, Anyplace
BAK=Back At Keyboard
BBL=Be Back Later
BBS=Be Back Soon
BFN=Bye For Now
B4N=Bye For Now
BRB=Be Right Back
BRT=Be Right There
BTW=By The Way
B4=Before
B4N=Bye For Now
CU=See You
CUL8R=See You Later
CYA=See You
FAQ=Frequently Asked Questions
FC=Fingers Crossed
FWIW=For What It's Worth
FYI=For Your Information
GAL=Get A Life
GG=Good Game
GN=Good Night
GMTA=Great Minds Think Alike
GR8=Great!
G9=Genius
IC=I See
ICQ=I Seek you (also a chat program)
ILU=ILU: I Love You
IMHO=In My Honest/Humble Opinion
IMO=In My Opinion
IOW=In Other Words
IRL=In Real Life
KISS=Keep It Simple, Stupid
LDR=Long Distance Relationship
LMAO=Laugh My A.. Off
LOL=Laughing Out Loud
LTNS=Long Time No See
L8R=Later
MTE=My Thoughts Exactly
M8=Mate
NRN=No Reply Necessary
OIC=Oh I See
PITA=Pain In The A..
PRT=Party
PRW=Parents Are Watching
QPSA?	Que Pasa?
ROFL=Rolling On The Floor Laughing
ROFLOL=Rolling On The Floor Laughing Out Loud
ROTFLMAO=Rolling On The Floor Laughing My A.. Off
SK8=Skate
STATS=Your sex and age
ASL=Age, Sex, Location
THX=Thank You
TTFN=Ta-Ta For Now!
TTYL=Talk To You Later
U=You
U2=You Too
U4E=Yours For Ever
WB=Welcome Back
WTF=What The F...
WTG=Way To Go!
WUF=Where Are You From?
W8=Wait...
7K=Sick:-D Laugher
TFW – That feeling when. TFW internet slang often goes in a caption to an image.
MFW – My face when
MRW – My reaction when
IFYP – I feel your pain
LOL – Laughing out loud
TNTL – Trying not to laugh
JK – Just kidding
IDC – I don’t care
ILY – I love you
IMU – I miss you
ADIH – Another day in hell
IDC – I don’t care
ZZZ – Sleeping, bored, tired
WYWH – Wish you were here
TIME – Tears in my eyes
BAE – Before anyone else
FIMH – Forever in my heart
BSAAW – Big smile and a wink
BWL – Bursting with laughter
LMAO – Laughing my a** off
BFF – Best friends forever
CSL – Can’t stop laughing
PC – Personal Computer"""

In [22]:
# Converted to lower-case
chat_words = chat_words.lower()
print(chat_words)

afaik=as far as i know
afk=away from keyboard
asap=as soon as possible
atk=at the keyboard
atm=at the moment
a3=anytime, anywhere, anyplace
bak=back at keyboard
bbl=be back later
bbs=be back soon
bfn=bye for now
b4n=bye for now
brb=be right back
brt=be right there
btw=by the way
b4=before
b4n=bye for now
cu=see you
cul8r=see you later
cya=see you
faq=frequently asked questions
fc=fingers crossed
fwiw=for what it's worth
fyi=for your information
gal=get a life
gg=good game
gn=good night
gmta=great minds think alike
gr8=great!
g9=genius
ic=i see
icq=i seek you (also a chat program)
ilu=ilu: i love you
imho=in my honest/humble opinion
imo=in my opinion
iow=in other words
irl=in real life
kiss=keep it simple, stupid
ldr=long distance relationship
lmao=laugh my a.. off
lol=laughing out loud
ltns=long time no see
l8r=later
mte=my thoughts exactly
m8=mate
nrn=no reply necessary
oic=oh i see
pita=pain in the a..
prt=party
prw=parents are watching
qpsa?	que pasa?
rofl=rolling on the floor laugh

In [23]:
# converted to dictionary
abbrevation = {}

for i in chat_words.split('\n'):
  if '=' in i:
    short_full = i.split("=")
    abbrevation[short_full[0].strip()] = short_full[1].strip()
  if '–' in i:
    short_full = i.split("–")
    abbrevation[short_full[0].strip()] = short_full[1].strip()

abbrevation

{'afaik': 'as far as i know',
 'afk': 'away from keyboard',
 'asap': 'as soon as possible',
 'atk': 'at the keyboard',
 'atm': 'at the moment',
 'a3': 'anytime, anywhere, anyplace',
 'bak': 'back at keyboard',
 'bbl': 'be back later',
 'bbs': 'be back soon',
 'bfn': 'bye for now',
 'b4n': 'bye for now',
 'brb': 'be right back',
 'brt': 'be right there',
 'btw': 'by the way',
 'b4': 'before',
 'cu': 'see you',
 'cul8r': 'see you later',
 'cya': 'see you',
 'faq': 'frequently asked questions',
 'fc': 'fingers crossed',
 'fwiw': "for what it's worth",
 'fyi': 'for your information',
 'gal': 'get a life',
 'gg': 'good game',
 'gn': 'good night',
 'gmta': 'great minds think alike',
 'gr8': 'great!',
 'g9': 'genius',
 'ic': 'i see',
 'icq': 'i seek you (also a chat program)',
 'ilu': 'ilu: i love you',
 'imho': 'in my honest/humble opinion',
 'imo': 'in my opinion',
 'iow': 'in other words',
 'irl': 'in real life',
 'kiss': 'keep it simple, stupid',
 'ldr': 'long distance relationship',
 'lm

In [24]:
# Function for chat words treatment
# parameter -> lowercase, strip

def chat_word_treatment(text):
  new_text = []
  for w in text.split():
    if w in abbrevation:
      new_text.append(abbrevation[w])
    else:
      new_text.append(w)
  return " ".join(new_text)


In [25]:
chat_word_treatment('   pc   ')

'personal computer'

In [26]:
df['tweet'] = df['tweet'].apply(lambda x: chat_word_treatment(x))

In [27]:
#Spelling Correction

In [28]:
# Accuracy is little low
from textblob import TextBlob

In [29]:
def spelling_correction(text):
  textBlb = TextBlob(text)
  return textBlb.correct().string

In [30]:
# This spelling correction is taking too long so not running
# df['tweet'].apply(spelling_correction)

In [31]:
# Removing Stop words

In [32]:
import nltk
from nltk.corpus import stopwords
nltk.download('all')
# stopwords.words('english')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_eng is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_r

True

In [33]:
stop_words = stopwords.words('english')
def remove_stopwords(text):
  new_text = []
  for word in text.split():
    if word in stop_words:
      new_text.append('')
    else:
      new_text.append(word)
  return " ".join(new_text)


In [34]:
remove_stopwords("hello my name is aravindra")

'hello  name  aravindra'

In [35]:
df['tweet'] = df['tweet'].apply(remove_stopwords)

In [36]:
# Stemming & #Tokenization

In [37]:
#Tokenization
import spacy
nlp = spacy.load('en_core_web_sm')

# Stemming
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [38]:
def token_stem_words(text):
  new_text = []

  doc1 = nlp(text)
  for word in doc1:
    new_text.append(ps.stem(word.text))

  return " ".join(new_text)



In [39]:
# sample = token_words('Aravindra is kissing machine learning')
# sample

In [40]:
# This code will take lot time to complete
df['transformed_text'] = df['tweet'].apply(token_stem_words)

In [41]:
df.head()

Unnamed: 0,topic,sentiment,tweet,transformed_text
0,borderlands,positive,im getting borderlands murder,i m get borderland murder
1,borderlands,positive,coming borders kill,come border kill
2,borderlands,positive,im getting borderlands kill,i m get borderland kill
3,borderlands,positive,im coming borderlands murder,i m come borderland murder
4,borderlands,positive,im getting borderlands murder,i m get borderland murder


# Feature Engineering
- Topic, tweet -> bag of words
- sentiment -> label encoder
- train test split

In [59]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()

In [60]:
# When this code runs colab jst restart's so seperate df['topic'] & df['tweet'] for fit_transform
combined_text = df['topic'] + " " + df['tweet']
X = cv.fit_transform(combined_text).toarray()
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [44]:
# label Encoder for Y

In [62]:
from sklearn.preprocessing import LabelEncoder

In [63]:
le = LabelEncoder()

In [64]:
Y = le.fit_transform(df['sentiment'])
Y

array([3, 3, 3, ..., 3, 3, 3])

# Model Training

In [65]:

from sklearn.model_selection import train_test_split

In [None]:
X_train,X_test,y_train,y_test = train_test_split(X,Y,test_size=0.2)

In [None]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score

In [None]:
gnb = GaussianNB()
mnb = MultinomialNB()
bnb = BernoulliNB()

In [52]:
# gnb.fit(X_train, y_train)
# y_pred1 = gnb.predict(X_test)
# print(accuracy_score(y_test,y_pred1))
# print(confusion_matrix(y_test,y_pred1))
# print(precision_score(y_test,y_pred1))