In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.pipeline import Pipeline
import pickle

In [2]:
tweet_df = pd.read_csv("train_preprocessed.csv")

In [3]:
tweet_df.describe()

Unnamed: 0.1,Unnamed: 0,ItemID,Sentiment
count,99989.0,99989.0,99989.0
mean,49994.0,50005.110042,0.564632
std,28864.48237,28865.894393,0.495808
min,0.0,1.0,0.0
25%,24997.0,25009.0,0.0
50%,49994.0,50006.0,1.0
75%,74991.0,75003.0,1.0
max,99988.0,100000.0,1.0


In [4]:
tweet_df["clean_text"]=tweet_df["clean_text"].astype('U')
tweet_df.head()

Unnamed: 0.1,Unnamed: 0,ItemID,Sentiment,SentimentText,clean_text
0,0,1,0,is so sad for my APL frie...,apl sad friend
1,1,2,0,I missed the New Moon trail...,trailer moon miss new
2,2,3,1,omg its already 7:30 :O,omg alreadi
3,3,4,0,.. Omgaga. Im sooo im gunna CRy. I'...,sooo sinc crown thi dentist gunna get put cri ...
4,4,5,0,i think mi bf is cheating on me!!! ...,think t_t cheat


In [5]:
pipeline = Pipeline([
   ( 'bow',CountVectorizer()),
    ('classifier',MultinomialNB()),
])

In [6]:
from sklearn.model_selection import train_test_split
msg_train,msg_test,label_train,label_test = train_test_split(tweet_df['clean_text'],tweet_df['Sentiment'],test_size=0.3)

In [7]:
print(len(msg_train))
print(len(label_train))
print(len(msg_test))
print(len(label_test))

69992
69992
29997
29997


In [8]:
pipeline.fit(msg_train, label_train)

Pipeline(memory=None,
         steps=[('bow',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('classifier',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [9]:
predictions = pipeline.predict(msg_test)

In [10]:
print(classification_report(predictions, label_test))

              precision    recall  f1-score   support

           0       0.63      0.74      0.68     11265
           1       0.82      0.74      0.78     18732

    accuracy                           0.74     29997
   macro avg       0.73      0.74      0.73     29997
weighted avg       0.75      0.74      0.74     29997



In [11]:
print(confusion_matrix(predictions, label_test))

[[ 8308  2957]
 [ 4812 13920]]


In [12]:
text ="friday havent listen realli mainten think night twitter"
pipeline.predict([text])[0]

1

In [13]:
df_pos = tweet_df[tweet_df["Sentiment"]==1]
df_neg = tweet_df[tweet_df["Sentiment"]==0]

In [14]:
df_pos.count()

Unnamed: 0       56457
ItemID           56457
Sentiment        56457
SentimentText    56457
clean_text       56457
dtype: int64

In [15]:
df_neg.count()

Unnamed: 0       43532
ItemID           43532
Sentiment        43532
SentimentText    43532
clean_text       43532
dtype: int64

In [16]:
df_pos_sample = df_pos.sample(n=43532)
df_combine = df_pos_sample.append(df_neg)

In [17]:
df_combine.groupby('Sentiment').describe()

Unnamed: 0_level_0,ItemID,ItemID,ItemID,ItemID,ItemID,ItemID,ItemID,ItemID,Unnamed: 0,Unnamed: 0,Unnamed: 0,Unnamed: 0,Unnamed: 0,Unnamed: 0,Unnamed: 0,Unnamed: 0
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
Sentiment,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
0,43532.0,47383.876413,29754.229279,1.0,20397.75,46716.0,73366.0,99998.0,43532.0,47373.154185,29752.35923,0.0,20385.75,46704.0,73354.0,99986.0
1,43532.0,52014.731347,27992.8119,3.0,28265.75,52391.0,76096.25,100000.0,43532.0,52003.322843,27991.814193,2.0,28253.75,52379.0,76084.25,99988.0


In [18]:
msg_train2,msg_test2,label_train2,label_test2 = train_test_split(df_combine['clean_text'],df_combine['Sentiment'],test_size=0.2)


In [22]:
model_pipeline = pipeline.fit(msg_train2, label_train2)

In [23]:
predictions2 = model_pipeline.predict(msg_test2)

In [24]:
print(classification_report(predictions2, label_test2))

              precision    recall  f1-score   support

           0       0.72      0.76      0.74      8203
           1       0.78      0.73      0.76      9210

    accuracy                           0.75     17413
   macro avg       0.75      0.75      0.75     17413
weighted avg       0.75      0.75      0.75     17413



In [25]:
pickle.dump(model_pipeline,open("model_pipeline.pkl","wb"))

In [29]:
df_pos_sample = df_pos.sample(n=5000)
df_neg_sample = df_neg.sample(n=5000)

In [30]:
df_combine2 = df_pos_sample.append(df_neg_sample)

In [31]:
msg_train2,msg_test2,label_train2,label_test2 = train_test_split(df_combine['clean_text'],df_combine['Sentiment'],test_size=0.2)
model_pipeline = pipeline.fit(msg_train2, label_train2)
predictions2 = model_pipeline.predict(msg_test2)
print(classification_report(predictions2, label_test2))

              precision    recall  f1-score   support

           0       0.72      0.75      0.73      8353
           1       0.76      0.73      0.75      9060

    accuracy                           0.74     17413
   macro avg       0.74      0.74      0.74     17413
weighted avg       0.74      0.74      0.74     17413

