In [57]:
import numpy as np
import pandas as pd

In [58]:
# Load the training CSV of the tweet-sentiment-extraction dataset.
# The path is relative, so it resolves against the notebook's working directory.
df=pd.read_csv("dataset/tweet-sentiment-extraction/train.csv")

In [59]:
# Preview the first rows to check the columns (textID, text, selected_text, sentiment).
df.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [60]:
# Dataset dimensions as (rows, columns).
df.shape

(27481, 4)

In [63]:
# Class balance: number of rows per sentiment label.
df['sentiment'].value_counts()

sentiment
neutral     11118
positive     8582
negative     7781
Name: count, dtype: int64

In [62]:
# Pull one tweet (row label 999) to eyeball what the raw text looks like.
tex=df['text'][999]

In [6]:
# Display the sampled tweet.
tex

' I`ve been unlocked for decades now...just not lucky, never have been.  Gottta make my own luck and that involves $$$$$'

In [7]:
# Count missing values in each column.
df.isnull().sum()

textID           0
text             1
selected_text    1
sentiment        0
dtype: int64

In [8]:
# Drop every row that has a missing value in any column (mutates df in place).
# The index is not reset, so later label-based lookups such as df['sentiment'][5333]
# still refer to the original row labels.
df.dropna(inplace=True)

In [9]:
# Re-check after dropna: every column should now report zero missing values.
df.isnull().sum()

textID           0
text             0
selected_text    0
sentiment        0
dtype: int64

In [10]:
# Count rows that are exact duplicates of an earlier row across all columns.
df.duplicated().sum()

0

In [11]:
import re
# Matches only spans shaped like markup: '<' immediately followed by a letter, '/' or '!',
# then anything up to the next '>'. Compiled once instead of on every call.
_TAG_PATTERN = re.compile(r'<[A-Za-z/!][^<>]*>')


def remove_tags(raw_text):
    """Return ``raw_text`` with HTML/XML-style tags removed.

    Only tag-shaped spans such as ``<b>``, ``</a>``, ``<br/>`` or ``<!-- x -->`` are
    stripped. A bare ``<`` that does not start a tag (``<3``, ``a < b``) is kept,
    so ordinary text between an unrelated ``<`` and a later ``>`` is not deleted.

    Parameters
    ----------
    raw_text : str
        Text that may contain markup.

    Returns
    -------
    str
        The text with each tag replaced by the empty string.
    """
    return _TAG_PATTERN.sub('', raw_text)

In [12]:
# Strip tags from the full tweet text.
# Note: the `text` column is dropped a few cells later, so this cleaned column never reaches the models.
df['text'] = df['text'].apply(remove_tags)

In [13]:
# Strip tags from the labelled phrase; this is the column the vectorisers are fitted on later.
df['selected_text'] = df['selected_text'].apply(remove_tags)

In [14]:
# Spot-check the sentiment label of the row with index label 5333.
df['sentiment'][5333]

'positive'

In [15]:
# Keep only the model input (`selected_text`) and the target (`sentiment`).
# Not idempotent: re-running this cell raises KeyError because the columns are already gone.
# NOTE(review): the classifiers below are therefore trained on the `selected_text` phrase
# rather than the full tweet; confirm that matches what will be available at prediction time.
df.drop(columns=['textID','text'],inplace=True)

In [54]:
# Display the frame (pandas elides the middle rows).
# NOTE: the saved output has execution count 54, i.e. it was produced after the lower-casing
# and stop-word cells further down, so it shows processed text rather than the state at this point.
df

Unnamed: 0,selected_text,sentiment
0,"i`d responded, going",neutral
1,sooo sad,negative
2,bullying,negative
3,leave alone,negative
4,"sons ****,",negative
...,...,...
27476,lost,negative
27477,", don`t force",negative
27478,yay good you.,positive
27479,worth ****.,positive


In [17]:
# Lower-case every phrase so that token matching is case-insensitive.
# Uses the vectorised string accessor instead of a per-row Python lambda; the result is the
# same here because rows with missing values were dropped earlier in the notebook.
df['selected_text'] = df['selected_text'].str.lower()

In [18]:
from nltk.corpus import stopwords

# NLTK's English stop-word list. `sw_list` is kept under its original name for any cell that uses it.
sw_list = stopwords.words('english')

# Membership tests against a list scan it linearly for every token; a set gives constant-time
# lookups across the roughly 27k phrases.
sw_set = set(sw_list)

# Split each phrase on whitespace, drop tokens that are stop words, and re-join with single spaces.
# Tokens keep attached punctuation, so e.g. "you." is not treated as the stop word "you".
df['selected_text'] = df['selected_text'].apply(
    lambda x: " ".join(item for item in x.split() if item not in sw_set)
)

In [19]:
# Preview the phrases after lower-casing and stop-word removal.
df.head()

Unnamed: 0,selected_text,sentiment
0,"i`d responded, going",neutral
1,sooo sad,negative
2,bullying,negative
3,leave alone,negative
4,"sons ****,",negative


In [20]:
# Feature frame: a single-column DataFrame holding the phrases. It is selected by name rather
# than by position, so it does not silently pick a different column if the column order changes.
# Kept 2-D because later cells index it as X_train['selected_text'].
X = df[['selected_text']]
# Target: the raw sentiment strings, encoded to integer ids in a following cell.
y = df['sentiment']

In [21]:
from sklearn.preprocessing import LabelEncoder

# Map the sentiment strings to integer class ids
# (0=negative, 1=neutral, 2=positive, per the inverse_transform output further down).
# Re-running this cell on its own would refit the encoder on the already-encoded integers,
# because `y` is overwritten here.
encoder = LabelEncoder()

y = encoder.fit_transform(y)

In [22]:
# Confirm which integer class ids are present after encoding.
np.unique(y)

array([0, 1, 2])

In [23]:
# Recover the original label string for each of the ids 0, 1 and 2.
ori = encoder.inverse_transform([0,1,2])

In [24]:
# Show the id -> label mapping (position i is the label for class id i).
ori

array(['negative', 'neutral', 'positive'], dtype=object)

In [25]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=1)

In [26]:
# Size of the training split as (rows, columns).
X_train.shape

(21984, 1)

In [27]:
# Spot-check one held-out phrase by its original row label (331).
X_test['selected_text'][331]

'k check out...'

In [28]:
# Applying BoW
from sklearn.feature_extraction.text import CountVectorizer

In [29]:
# Bag-of-words vectoriser with scikit-learn's default settings.
cv = CountVectorizer()

In [30]:
# Learn the vocabulary from the training phrases only, then encode the test phrases with that
# same vocabulary (the vectoriser is not refitted on test data).
# Both results are converted to dense arrays; the training one is 21984 x 15402 (see the shape
# below), so this step is memory-hungry.
# NOTE(review): presumably densified because GaussianNB is fitted on them next; confirm.
X_train_bow = cv.fit_transform(X_train['selected_text']).toarray()
X_test_bow = cv.transform(X_test['selected_text']).toarray()

In [31]:
# (number of training phrases, vocabulary size).
X_train_bow.shape

(21984, 15402)

In [32]:
from sklearn.naive_bayes import GaussianNB
# Baseline model: Gaussian naive Bayes fitted on the dense bag-of-words counts.
gnb = GaussianNB()

gnb.fit(X_train_bow,y_train)

In [33]:
# Predict class ids for the held-out phrases with the naive-Bayes baseline.
y_pred = gnb.predict(X_test_bow)

from sklearn.metrics import accuracy_score,confusion_matrix
# Fraction of held-out phrases whose predicted class id equals the true one.
accuracy_score(y_test,y_pred)

0.5687772925764192

In [34]:
# Per-class breakdown of the baseline: rows are true class ids, columns are predicted class ids.
confusion_matrix(y_test,y_pred)

array([[ 492,  173,  897],
       [ 332, 1105,  756],
       [  52,  160, 1529]], dtype=int64)

In [35]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with default hyper-parameters, seeded so that the fitted trees (and therefore
# the reported accuracy) are the same on every run. The accuracy in the saved output came from
# an unseeded run and may differ slightly.
rf = RandomForestClassifier(random_state=1)

In [36]:
# Train the forest on the dense bag-of-words features.
rf.fit(X_train_bow,y_train)

In [37]:
# Replace the naive-Bayes predictions held in y_pred with the forest's predictions.
y_pred = rf.predict(X_test_bow)

In [38]:
# Held-out accuracy of the random forest on bag-of-words features.
accuracy_score(y_test,y_pred)

0.7740174672489083

# TF-IDF

In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [40]:
# TF-IDF vectoriser with scikit-learn's default settings.
tfidf = TfidfVectorizer()

In [42]:
# Fit the TF-IDF vocabulary and weights on the training phrases only, then encode the test
# phrases with the fitted vectoriser.
# Both matrices are kept sparse. Previously the training matrix was densified while the test
# matrix was not; RandomForestClassifier accepts sparse input for fit and predict, and skipping
# the dense copy avoids holding a (n_train x vocabulary) float array in memory.
X_train_tfidf = tfidf.fit_transform(X_train['selected_text'])
X_test_tfidf = tfidf.transform(X_test['selected_text'])

In [43]:
# Fresh forest for the TF-IDF features (rebinds `rf`, replacing the bag-of-words model).
# Seeded so that the fitted model and its accuracy are the same on every run; the accuracy in
# the saved output came from an unseeded run and may differ slightly.
rf = RandomForestClassifier(random_state=1)

In [44]:
# Train the forest on the TF-IDF features.
rf.fit(X_train_tfidf,y_train)

In [45]:
# Predict class ids for the held-out phrases from their TF-IDF encoding.
y_pred = rf.predict(X_test_tfidf)

In [46]:
# Held-out accuracy of the random forest on TF-IDF features.
accuracy_score(y_test,y_pred)

0.8104075691411936

In [49]:
# Pick one held-out phrase (row label 33) for a manual end-to-end prediction check.
text=X_test['selected_text'][33]

In [52]:
# Show the phrase that is about to be classified.
text

'funny.'

In [50]:
# Encode the single phrase with the fitted TF-IDF vectoriser; it is wrapped in a list because
# transform is given a collection of documents, not a bare string.
test = tfidf.transform([text])

In [51]:
# Predict the class id for that phrase (the saved output, 2, is 'positive' in the encoder mapping shown earlier).
rf.predict(test)

array([2])

In [53]:
import pickle
# Persist the fitted TF-IDF vectoriser and the forest currently bound to `rf` so they can be
# reloaded together. Context managers ensure each file is flushed and closed; the previous
# bare open(...) calls left the handles unclosed.
# NOTE(review): the LabelEncoder is not saved, so consumers must know the id -> label mapping
# (0=negative, 1=neutral, 2=positive per the inverse_transform output earlier in this notebook).
with open('vectorizer.pkl','wb') as vectorizer_file:
    pickle.dump(tfidf, vectorizer_file)
with open('model.pkl','wb') as model_file:
    pickle.dump(rf, model_file)