## Twitter data sentiment analysis

## 1. Importing libraries

In [186]:
import pandas as pd

## 2. Importing dataset of twitts

In [289]:
df = pd.read_csv('training.1600000.processed.noemoticon.csv',encoding='ISO-8859-1')

In [290]:
df.head()

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [291]:
df.columns = ['label','ID','Timestamp','NO_QUERY','_TheSpecialOne_','Data']
df.drop(['ID','Timestamp','NO_QUERY','_TheSpecialOne_'],axis = 1,inplace=True)

## 3. Dataframe preparation

**3.1 Collecting set of posivite and negative tweets**

In [292]:
df2 = df.iloc[0:200000]
df3 = df[800000:1000000]

In [293]:
df2.label.value_counts()

0    200000
Name: label, dtype: int64

**"0" indicates negative tweet**

In [193]:
df3.label.value_counts()

4    200000
Name: label, dtype: int64

**"4" indicates Positive tweet**

**3.2 Mergine both sets in single dataframe**

In [294]:
df4 = pd.concat([df2,df3])
df4.reset_index(inplace=True)

## 4. Importing Natural langugae processing libraries

In [198]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
lemmatizing = WordNetLemmatizer()

## 5. Data preprocessing

**5.1 Using Regular Expresson (re) and nltk library for data processing**

Removing all punctuations from dataset

Converting all words to lower case format

Preparing list of words from sentences

Removing stopwards from dataset

Lemmatizing the words

Joining all the words to form the sentence

Collecting Processed data into 'data'

In [199]:
data =[]
for i in range(0, len(df4)):
    corrected_data = re.sub('[^a-zA-Z]',' ',df4['Data'][i])
    corrected_data = corrected_data.lower()
    corrected_data = corrected_data.split()
    
    corrected_data = [lemmatizing.lemmatize(word) for word in corrected_data if word not in set(stopwords.words('english'))]
    corrected_data = ' '.join(corrected_data)
    data.append(corrected_data)

**5.2 Coverting Strings into numerical vector format with TfidfVectorizer**

In [200]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizing = CountVectorizer()
X = vectorizing.fit_transform(data)
y = df4.label

## 6.Model training and Evaluation

**6.1 Splitting dataset in the form of Training and Testing dataset using train_test_split method** 

In [201]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

**6.2 Training the model with naive_bayes classification Algorithm**

In [202]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()

In [203]:
model.fit(X_train,y_train)

MultinomialNB()

**6.3 Predicting the value for test dataset** 

In [204]:
y_predict = model.predict(X_test)

**6.4 Evaluating the result using confusion_matrix, classification_report**

In [205]:
from sklearn.metrics import confusion_matrix, classification_report

In [206]:
print(confusion_matrix(y_predict,y_test))
print(classification_report(y_predict,y_test))

[[47309 16577]
 [12527 43587]]
              precision    recall  f1-score   support

           0       0.79      0.74      0.76     63886
           4       0.72      0.78      0.75     56114

    accuracy                           0.76    120000
   macro avg       0.76      0.76      0.76    120000
weighted avg       0.76      0.76      0.76    120000



In [265]:
new_data = ['You are bad person']

In [266]:
vect_new_data = vectorizing.transform(new_data)

In [267]:
new_predict = model.predict(vect_new_data)

if new_predict == 0:
    print('It is a negative tweet')

elif new_predict == 4:
    print('It is a Positive tweet')
    

It is a negative tweet


## 7. Saving model with pickle

In [268]:
import pickle


In [269]:
pickle.dump(vectorizing, open('countvectorizer.pkl', 'wb'))

In [270]:
pickle.dump(model, open('twitter_nlp.pkl', 'wb'))

In [276]:
pkl_model = pickle.load(open('twitter_nlp.pkl', 'rb'))

In [283]:
pkl_vectorizing = pickle.load(open('countvectorizer.pkl','rb'))

## 8. Predicting result with saved model

In [1]:
data = 'i am happy'

In [281]:
data = [data]

In [284]:
data = pkl_vectorizing.transform(data)

In [285]:
pkl_predict = pkl_model.predict(data)

if pkl_predict == 0:
    print('It is a negative tweet')

elif pkl_predict == 4:
    print('It is a Positive tweet')
    

It is a Positive tweet
