# Name : Katha Patel
# Roll No : 41
# GR No : 11810454
# Assignment 7 - Sentiment Analysis of twitter data using Python
## Importing the libraries

In [1]:
import pandas as pd
from scipy import stats
import re
import numpy as np
import nltk
from nltk.corpus import stopwords

## Importing the dataset

In [2]:
# Load the labelled tweets; the path is relative to the notebook's working directory.
df = pd.read_csv("dataset/tweets1.csv")
# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,sentiment_text,sentiment
0,is so sad for my APL friend.............,0
1,I missed the New Moon trailer…,0
2,.. Omgaga. Im sooo im gunna CRy. I've been at...,0
3,i think mi bf is cheating on me!!! ...,0
4,or i just worry too much?,0
...,...,...
1128,missin' someone badly. ha ha 'someone'. that's...,1
1129,Missing @kieronhughes and looking forward to s...,1
1130,Missing @smartbrain from BKK na.,1
1131,"Missing @twisteddspirit, why did you have to m...",1


## Dataset properties

In [3]:
# (rows, columns) of the loaded frame.
df.shape

(1133, 2)

In [4]:
# Non-null row count per sentiment label, to check how balanced the two classes are.
df.groupby('sentiment').count()

Unnamed: 0_level_0,sentiment_text
sentiment,Unnamed: 1_level_1
0,700
1,433


## Remove stop words from the dataset

In [5]:
# Preview the tweet text before any preprocessing is applied.
df["sentiment_text"]

0                is so sad for my APL friend.............
1                          I missed the New Moon trailer…
2       .. Omgaga. Im sooo  im gunna CRy. I've been at...
3                i think mi bf is cheating on me!!!   ...
4                       or i just worry too much?        
                              ...                        
1128    missin' someone badly. ha ha 'someone'. that's...
1129    Missing @kieronhughes and looking forward to s...
1130                    Missing @smartbrain from BKK na. 
1131    Missing @twisteddspirit, why did you have to m...
1132    Missing Amarillo and the lake, but loving Temp...
Name: sentiment_text, Length: 1133, dtype: object

In [6]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# NOTE: word_tokenize and stopwords.words need the corresponding NLTK data
# packages to be installed locally before this cell can run.

# `stop` stays a list, exactly as before; the set copy gives constant-time
# membership checks instead of scanning the whole list for every token.
stop = stopwords.words('english')
stop_set = set(stop)

# Lower-case each tweet, tokenize it, drop English stop words and re-join the
# remaining tokens with single spaces, all in one pass over the column.
df["sentiment_text"] = (
    df["sentiment_text"]
    .str.lower()
    .apply(lambda text: ' '.join(tok for tok in word_tokenize(text) if tok not in stop_set))
)
df['sentiment_text']

0                        sad apl friend ... ... ... ... .
1                                missed new moon trailer…
2       .. omgaga . im sooo im gunna cry . 've dentist...
3                          think mi bf cheating ! ! ! t_t
4                                            worry much ?
                              ...                        
1128    missin ' someone badly . ha ha 'someone ' . 's...
1129    missing @ kieronhughes looking forward seeing ...
1130                        missing @ smartbrain bkk na .
1131    missing @ twisteddspirit , move ta-hatch-a-pea...
1132              missing amarillo lake , loving temple .
Name: sentiment_text, Length: 1133, dtype: object

## Remove Extra characters from the dataset

In [7]:
# Select the columns by name rather than by position, so the cell keeps working
# if the CSV ever gains or reorders columns.
# `features` holds the stop-word-filtered tweet text, `labels` the 0/1 sentiment.
features = df['sentiment_text'].values
labels = df['sentiment'].values

In [8]:
def _clean_text(text):
    """Return `text` reduced to lower-case, space-separated multi-character tokens.

    Steps:
      1. every non-word character is replaced by a space;
      2. stand-alone single ASCII letters are removed wherever they occur;
      3. whitespace runs are collapsed and the ends are trimmed;
      4. the result is lower-cased.
    """
    # `\W` leaves letters, digits and `_` untouched, so a token such as "t_t"
    # survives this step.
    cleaned = re.sub(r'\W', ' ', str(text))

    # One word-boundary pattern replaces the previous three rules (middle,
    # start, and leading 'b'). The old start-of-string rule was written as
    # r'\^...' and therefore searched for a literal '^' that can never be
    # present after step 1; the old middle rule consumed the surrounding
    # whitespace, so it skipped every other letter in a run such as "a b c"
    # and never matched a letter at the very end of the text.
    cleaned = re.sub(r'\b[a-zA-Z]\b', ' ', cleaned)

    # Collapse the gaps left behind and drop leading/trailing spaces.
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()

    return cleaned.lower()


processed_features = [_clean_text(text) for text in features]
df['sentiment_text'] = processed_features
df['sentiment_text']

0                                         sad apl friend 
1                                missed new moon trailer 
2        omgaga im sooo im gunna cry ve dentist since ...
3                                think mi bf cheating t_t
4                                             worry much 
                              ...                        
1128    missin someone badly ha ha someone mystery might 
1129    missing kieronhughes looking forward seeing later
1130                           missing smartbrain bkk na 
1131    missing twisteddspirit move ta hatch pea fresn...
1132                 missing amarillo lake loving temple 
Name: sentiment_text, Length: 1133, dtype: object

## Lemmatization of the words

In [9]:
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()


def _lemmatize_text(text):
    """Tokenize `text`, lemmatize every token and re-join them with single spaces.

    `lemmatize` is called without a part-of-speech argument, so the library
    default applies (nouns, per the NLTK documentation); inflected verbs such
    as "missed" are therefore left as they are.
    """
    return ' '.join(lemmatizer.lemmatize(token) for token in word_tokenize(text))


df['sentiment_text'] = df['sentiment_text'].apply(_lemmatize_text)
df['sentiment_text']

0                                          sad apl friend
1                                 missed new moon trailer
2       omgaga im sooo im gunna cry ve dentist since 1...
3                                think mi bf cheating t_t
4                                              worry much
                              ...                        
1128     missin someone badly ha ha someone mystery might
1129    missing kieronhughes looking forward seeing later
1130                            missing smartbrain bkk na
1131    missing twisteddspirit move ta hatch pea fresn...
1132                  missing amarillo lake loving temple
Name: sentiment_text, Length: 1133, dtype: object

## Transforming the dataset into numerical data using Count vectorizer

In [10]:
# Take the cleaned text by column name instead of by position, so the cell does
# not silently pick up a different column if the frame layout changes.
processed_feature = df['sentiment_text'].values

In [11]:
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts: at most 2000 terms, each appearing in at least 3 tweets
# and in no more than 60% of them; English stop words are excluded.
vectorizer = CountVectorizer(
    max_features=2000,
    min_df=3,
    max_df=0.6,
    stop_words=stopwords.words('english'),
)

# Read the text straight from the frame rather than from `processed_feature`:
# this cell overwrites `processed_feature` with the count matrix, so feeding
# that name back in made a second run of the cell fail on numeric input.
# NOTE(review): the vocabulary is fitted on every tweet before the train/test
# split, so the test rows influence the feature set. Consider fitting on the
# training rows only.
processed_feature = vectorizer.fit_transform(df['sentiment_text']).toarray()

## Splitting the dataset into training and testing dataset

In [12]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    processed_feature,
    labels,
    test_size=0.25,
    random_state=0,
)

## KNN

In [13]:
from sklearn.neighbors import KNeighborsClassifier

# 5-nearest-neighbours with the Minkowski metric at p=2.
# `fit` returns the estimator itself, so the chained call leaves the fitted
# model in `classifier`.
classifier = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2).fit(X_train, y_train)
# Bare expression so the notebook still shows the estimator's parameters.
classifier

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [14]:
# Predict the held-out tweets with the fitted KNN classifier.
y_pred = classifier.predict(X_test)

## Calculating the confusion matrix and accuracy

In [15]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Score the KNN predictions currently held in `y_pred` against the test labels.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
# Last expression: fraction of test tweets classified correctly.
accuracy_score(y_true=y_test, y_pred=y_pred)

[[148  13]
 [ 97  26]]


0.6126760563380281

### It gives 61.3% accuracy
## SVM

In [16]:
from sklearn.svm import SVC

# Linear-kernel support vector classifier, seeded for repeatability.
# `fit` returns the estimator, so the chained call stores the fitted model.
classifiersvm = SVC(kernel='linear', random_state=0).fit(X_train, y_train)
# Overwrites `y_pred` with the SVM predictions for the evaluation cell below.
y_pred = classifiersvm.predict(X_test)

## Calculating the confusion matrix and accuracy

In [17]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Score the SVM predictions currently held in `y_pred` against the test labels.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
# Last expression: fraction of test tweets classified correctly.
accuracy_score(y_true=y_test, y_pred=y_pred)

[[138  23]
 [ 63  60]]


0.6971830985915493

###  It gives 69.7% accuracy
## Naive Bayes

In [19]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes on the raw term counts. The variable keeps its
# original name `gnb` even though the model is multinomial, not Gaussian.
gnb = MultinomialNB().fit(X_train, y_train)
# Overwrites `y_pred` with the Naive Bayes predictions for the evaluation cell below.
y_pred = gnb.predict(X_test)

## Calculating the confusion matrix and accuracy

In [20]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Score the Naive Bayes predictions currently held in `y_pred` against the test labels.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
# Last expression: fraction of test tweets classified correctly.
accuracy_score(y_true=y_test, y_pred=y_pred)

[[140  21]
 [ 54  69]]


0.7359154929577465

### It gives 73.6% accuracy
## Logistic Regression

In [21]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with library-default settings apart from the fixed seed.
# `fit` returns the estimator, so the chained call stores the fitted model.
LogReg_clf = LogisticRegression(random_state=0).fit(X_train, y_train)
# Overwrites `y_pred` with the logistic-regression predictions for the evaluation cell below.
y_pred = LogReg_clf.predict(X_test)

## Calculating the confusion matrix and accuracy

In [22]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Score the logistic-regression predictions currently held in `y_pred` against the test labels.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
# Last expression: fraction of test tweets classified correctly.
accuracy_score(y_true=y_test, y_pred=y_pred)

[[143  18]
 [ 68  55]]


0.6971830985915493

### It gives 69.7% accuracy
## Random Forest Classification

In [23]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees, seeded so the ensemble is repeatable.
# `fit` returns the estimator, so the chained call stores the fitted model.
text_classifier = RandomForestClassifier(n_estimators=100, random_state=0).fit(X_train, y_train)
# Overwrites `y_pred` with the random-forest predictions for the evaluation cell below.
y_pred = text_classifier.predict(X_test)

## Calculating the confusion matrix and accuracy

In [24]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Score the random-forest predictions currently held in `y_pred` against the test labels.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
# Last expression: fraction of test tweets classified correctly.
accuracy_score(y_true=y_test, y_pred=y_pred)

[[133  28]
 [ 51  72]]


0.721830985915493

### It gives 72.2% accuracy