### Project on Sentimental analysis for social media on twitter dataset

In [3]:
!pip install wordcloud


Collecting wordcloud
  Downloading wordcloud-1.9.2-cp311-cp311-win_amd64.whl (151 kB)
     ------------------------------------ 151.4/151.4 kB 475.4 kB/s eta 0:00:00
Installing collected packages: wordcloud
Successfully installed wordcloud-1.9.2



[notice] A new release of pip available: 22.3.1 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
import pandas as pd
import numpy as np
import contractions
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.tokenize import RegexpTokenizer
import string
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression


In [5]:
DATASET_COLUMNS=['target','ids','date','flag','user','text']
csv_file = pd.read_csv("D:/twitter_training_data.csv",encoding='latin-1',names=DATASET_COLUMNS)
csv_file.head()

Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [6]:
# making extra file for 
data = csv_file.copy()
data.head()

Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [7]:
# removing unwanted features
data.drop(columns=['ids','date','flag','user'],axis=1)


Unnamed: 0,target,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."
...,...,...
1599995,4,Just woke up. Having no school is the best fee...
1599996,4,TheWDB.com - Very cool to hear old Walt interv...
1599997,4,Are you ready for your MoJo Makeover? Ask me f...
1599998,4,Happy 38th Birthday to my boo of alll time!!! ...


In [8]:
data = data[["text",'target']]

In [9]:
data['target'] = data['target'].replace(4,1)
data

Unnamed: 0,text,target
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",0
1,is upset that he can't update his Facebook by ...,0
2,@Kenichan I dived many times for the ball. Man...,0
3,my whole body feels itchy and like its on fire,0
4,"@nationwideclass no, it's not behaving at all....",0
...,...,...
1599995,Just woke up. Having no school is the best fee...,1
1599996,TheWDB.com - Very cool to hear old Walt interv...,1
1599997,Are you ready for your MoJo Makeover? Ask me f...,1
1599998,Happy 38th Birthday to my boo of alll time!!! ...,1


### Data cleaning

In [10]:
# any null values
data.isnull().sum()

text      0
target    0
dtype: int64

In [11]:
data.target.unique()

array([0, 1], dtype=int64)

In [12]:
data.iloc[2][0]

'@Kenichan I dived many times for the ball. Managed to save 50%  The rest go out of bounds'

In [13]:
# fixing contractions
def fixing_contractions(text):
    return contractions.fix(text)
data['text'] = data['text'].apply(fixing_contractions)
data.head()
    

Unnamed: 0,text,target
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",0
1,is upset that he cannot update his Facebook by...,0
2,@Kenichan I dived many times for the ball. Man...,0
3,my whole body feels itchy and like its on fire,0
4,"@nationwideclass no, it is not behaving at all...",0


In [14]:
# removing user names and https links
def remove_usernames_links(tweet):
    tweet = re.sub('@[^\s]+','',tweet)
    tweet = re.sub('http[^\s]+','',tweet)
    return tweet
data['text'] = data['text'].apply(remove_usernames_links)

In [15]:
data.head()

Unnamed: 0,text,target
0,"- Awww, that is a bummer. You shoulda got D...",0
1,is upset that he cannot update his Facebook by...,0
2,I dived many times for the ball. Managed to s...,0
3,my whole body feels itchy and like its on fire,0
4,"no, it is not behaving at all. i am mad. why ...",0


In [16]:
data.head()

Unnamed: 0,text,target
0,"- Awww, that is a bummer. You shoulda got D...",0
1,is upset that he cannot update his Facebook by...,0
2,I dived many times for the ball. Managed to s...,0
3,my whole body feels itchy and like its on fire,0
4,"no, it is not behaving at all. i am mad. why ...",0


In [17]:
# # removing stop words
stop_words = set(stopwords.words('english'))
def clean_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in set(stop_words)])
data['text'] = data['text'].apply(lambda text: clean_stopwords(text))
data.head()

Unnamed: 0,text,target
0,"- Awww, bummer. You shoulda got David Carr Thi...",0
1,upset cannot update Facebook texting it... mig...,0
2,I dived many times ball. Managed save 50% The ...,0
3,whole body feels itchy like fire,0
4,"no, behaving all. mad. here? I cannot see there.",0


In [18]:
# removing punctuations
punctuations_list = string.punctuation
def remove_punctations(text):
    for i in text:
        for j in punctuations_list:
            if i == j:
                text = text.replace(i,' ')
    return text.strip()

data['text'] = data['text'].apply(remove_punctations)

In [19]:
data

Unnamed: 0,text,target
0,Awww bummer You shoulda got David Carr Third...,0
1,upset cannot update Facebook texting it mig...,0
2,I dived many times ball Managed save 50 The ...,0
3,whole body feels itchy like fire,0
4,no behaving all mad here I cannot see there,0
...,...,...
1599995,Just woke up Having school best feeling ever,1
1599996,TheWDB com Very cool hear old Walt interview...,1
1599997,Are ready MoJo Makeover Ask details,1
1599998,Happy 38th Birthday boo alll time Tupac Ama...,1


In [20]:
# converting to lower case and removing repeating characters
# data['text' ] = data['text'].str.lower()
def clean_repeating_char(text):
    return re.sub(r"(.)\1\1+", r"\1\1", text)
data['text'] = data['text'].apply(clean_repeating_char)
data
 
    

Unnamed: 0,text,target
0,Aww bummer You shoulda got David Carr Third ...,0
1,upset cannot update Facebook texting it might...,0
2,I dived many times ball Managed save 50 The ...,0
3,whole body feels itchy like fire,0
4,no behaving all mad here I cannot see there,0
...,...,...
1599995,Just woke up Having school best feeling ever,1
1599996,TheWDB com Very cool hear old Walt interviews...,1
1599997,Are ready MoJo Makeover Ask details,1
1599998,Happy 38th Birthday boo all time Tupac Amaru ...,1


In [21]:
# cleaning emojis
def clean_emoji(data):    
    data = re.sub(r'<3', '<heart>', data)
    data = re.sub(r"[8:=;]['`\-]?[)d]+", '<smile>', data)
    data = re.sub(r"[8:=;]['`\-]?\(+", '<sad>', data)
    data = re.sub(r"[8:=;]['`\-]?[\/|l*]", '<neutral>', data)
    data = re.sub(r"[8:=;]['`\-]?p+", '<laugh>', data)
    return data
data['text'] = data['text'].apply(lambda x: clean_emoji(x))
data

Unnamed: 0,text,target
0,Aww bummer You shoulda got David Carr Third ...,0
1,upset cannot update Facebook texting it might...,0
2,I dived many times ball Managed save 50 The ...,0
3,whole body feels itchy like fire,0
4,no behaving all mad here I cannot see there,0
...,...,...
1599995,Just woke up Having school best feeling ever,1
1599996,TheWDB com Very cool hear old Walt interviews...,1
1599997,Are ready MoJo Makeover Ask details,1
1599998,Happy 38th Birthday boo all time Tupac Amaru ...,1


In [22]:
# cleaning any numberss - since numbers have no use in detecting the emotion of tweet
def cleaning_number(text):
    return re.sub(r'\d',' ',text)
data['text'] = data['text'].apply(cleaning_number)
data

Unnamed: 0,text,target
0,Aww bummer You shoulda got David Carr Third ...,0
1,upset cannot update Facebook texting it might...,0
2,I dived many times ball Managed save The ...,0
3,whole body feels itchy like fire,0
4,no behaving all mad here I cannot see there,0
...,...,...
1599995,Just woke up Having school best feeling ever,1
1599996,TheWDB com Very cool hear old Walt interviews...,1
1599997,Are ready MoJo Makeover Ask details,1
1599998,Happy th Birthday boo all time Tupac Amaru ...,1


In [23]:
data.isnull().sum()

text      0
target    0
dtype: int64

In [24]:
stop_words = set(stopwords.words('english'))
def clean_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in set(stop_words)])
data['text'] = data['text'].apply(lambda text: clean_stopwords(text))
data.head()

Unnamed: 0,text,target
0,Aww bummer You shoulda got David Carr Third Day D,0
1,upset cannot update Facebook texting might cry...,0
2,I dived many times ball Managed save The rest ...,0
3,whole body feels itchy like fire,0
4,behaving mad I cannot see,0


In [25]:
data

Unnamed: 0,text,target
0,Aww bummer You shoulda got David Carr Third Day D,0
1,upset cannot update Facebook texting might cry...,0
2,I dived many times ball Managed save The rest ...,0
3,whole body feels itchy like fire,0
4,behaving mad I cannot see,0
...,...,...
1599995,Just woke Having school best feeling ever,1
1599996,TheWDB com Very cool hear old Walt interviews â«,1
1599997,Are ready MoJo Makeover Ask details,1
1599998,Happy th Birthday boo time Tupac Amaru Shakur,1


In [26]:
# tokenization
# tokenizing words
data['text'] = data['text'].apply(RegexpTokenizer(r'\w+').tokenize)
data

Unnamed: 0,text,target
0,"[Aww, bummer, You, shoulda, got, David, Carr, ...",0
1,"[upset, cannot, update, Facebook, texting, mig...",0
2,"[I, dived, many, times, ball, Managed, save, T...",0
3,"[whole, body, feels, itchy, like, fire]",0
4,"[behaving, mad, I, cannot, see]",0
...,...,...
1599995,"[Just, woke, Having, school, best, feeling, ever]",1
1599996,"[TheWDB, com, Very, cool, hear, old, Walt, int...",1
1599997,"[Are, ready, MoJo, Makeover, Ask, details]",1
1599998,"[Happy, th, Birthday, boo, time, Tupac, Amaru,...",1


In [None]:
# converting text into numbers
# using bag of words
ps = PorterStemmer()
def stem_text(txt):
    return [ps.stem(word) for word in txt]

data['text'] = data['text'].apply(stem_text)
data

## Model Building

In [None]:
# Input Feature and Label

X = data.text
y = data.target

In [None]:
# Splitting our data into Train and Test
X_train,X_val,y_train,y_val = train_test_split(X, y, test_size = 0.25, random_state = 42)

In [None]:
# TFIDF : Extracting Tf-idf features

from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_df=0.90, min_df=0.02, max_features=1000, stop_words='english')

tfidf.fit(list(X_train) + list(X_val))
X_train_tfidf = tfidf.transform(X_train)
X_val_tfidf = tfidf.transform(X_val)

In [None]:
# Naive bayes using Tf-idf features

nb = MultinomialNB()
nb.fit(X_train_tfidf, y_train)
y_pred_nb = nb.predict(X_val_tfidf)
print('naive bayes tfidf accuracy %s' % accuracy_score(y_pred_nb, y_val))

In [None]:
n = LogisticRegression()
n.fit(X_train_tfidf,y_train)
y_pred_nb = n.predict(X_val_tfidf)
print('Logistice regression accuaracy %s' % accuracy_score(y_pred_nb,y_val))