In [39]:
import sklearn
import pickle
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [40]:
# Read the tab-separated sentiment file. header=None means the first line is
# treated as data, so the two columns are named explicitly here.
sentiment_data = pd.read_csv(
    'datasets/sentimental_analysis_data.csv',
    sep='\t',
    names=['Label', 'Text'],
    header=None,
)

# Show ten randomly drawn rows as a quick look at the loaded frame.
sentiment_data.sample(n=10)

Unnamed: 0,Label,Text
572,1,"The Da Vinci Code was awesome, I can't wait to..."
3817,1,Brokeback Mountain was an AWESOME movie.
3550,1,I either LOVE Brokeback Mountain or think it's...
6282,0,"Oh, and Brokeback Mountain is a TERRIBLE movie..."
3975,0,"then was the da vinci code, which sucked reall..."
1893,1,"So as felicia's mom is cleaning the table, fel..."
1873,1,"So as felicia's mom is cleaning the table, fel..."
6216,0,Ok brokeback mountain is such a horrible movie.
6770,0,", she helped me bobbypin my insanely cool hat ..."
2052,1,Write us a Harry Potter poem for a chance to w...


In [41]:
# Model input is the 'Text' column; the prediction target is the 'Label' column.
X, Y = sentiment_data['Text'], sentiment_data['Label']

In [42]:
# Hold out 20% of the rows for evaluation. The shuffle is seeded so that a
# fresh "Restart & Run All" produces the same train/test partition, which keeps
# the accuracy figures of the models below comparable between runs.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [43]:
# TF-IDF text featurizer used as the first step of every pipeline below;
# max_features=15 limits how many terms it keeps.
tfidf_vect = TfidfVectorizer(
    max_features=15,
)

In [44]:
# Logistic-regression classifier using the liblinear solver. random_state is
# fixed so that repeated runs of the notebook fit the same model.
logistic_clf = LogisticRegression(solver='liblinear', random_state=42)

In [45]:
# Chain the TF-IDF featurizer with the logistic-regression classifier, then
# fit the whole chain on the training split.
clf_pipeline = Pipeline(
    [
        ('tfidf_vect', tfidf_vect),
        ('classifier', logistic_clf),
    ]
)
pipeline_model = clf_pipeline.fit(x_train, y_train)

In [46]:
# Predict labels for the held-out texts with the fitted logistic pipeline.
y_pred = pipeline_model.predict(X=x_test)

In [47]:
# Compare the predictions against the held-out labels; the bare expression on
# the last line makes the notebook display the resulting score.
Accuracy_score = accuracy_score(y_true=y_test, y_pred=y_pred)
Accuracy_score

0.8894508670520231

In [48]:
# Persist the fitted logistic-regression pipeline. The `with` block guarantees
# the file handle is flushed and closed even if pickling raises.
with open('models/logistic_clf/model.pkl', 'wb') as model_file:
    pickle.dump(pipeline_model, model_file)

In [49]:
# Decision-tree classifier limited to a depth of 10. random_state is fixed so
# that repeated runs of the notebook grow the same tree.
decision_tree_clf = DecisionTreeClassifier(max_depth=10, random_state=42)

In [50]:
# Chain the TF-IDF featurizer with the decision-tree classifier, then fit the
# whole chain on the training split.
clf_pipeline = Pipeline(
    [
        ('tfidf_vect', tfidf_vect),
        ('classifier', decision_tree_clf),
    ]
)
pipeline_model = clf_pipeline.fit(x_train, y_train)

In [51]:
# Predict labels for the held-out texts with the fitted decision-tree pipeline.
y_pred = pipeline_model.predict(X=x_test)

In [52]:
# Score the decision-tree predictions against the held-out labels; the bare
# expression on the last line makes the notebook display the result.
Accuracy_score = accuracy_score(y_true=y_test, y_pred=y_pred)
Accuracy_score

0.8937861271676301

In [53]:
# Persist the fitted decision-tree pipeline. The `with` block guarantees the
# file handle is flushed and closed even if pickling raises.
with open('models/decision_tree_clf/model.pkl', 'wb') as model_file:
    pickle.dump(pipeline_model, model_file)

In [55]:
# Linear support-vector classifier with C=1.0 and at most 100 solver
# iterations. random_state is fixed so that repeated runs fit the same model.
# NOTE(review): max_iter=100 is a tight iteration budget — confirm that fitting
# does not emit a ConvergenceWarning.
linear_svc_clf = LinearSVC(C=1.0, max_iter=100, random_state=42)

In [56]:
# Chain the TF-IDF featurizer with the linear SVC classifier, then fit the
# whole chain on the training split.
clf_pipeline = Pipeline(
    [
        ('tfidf_vect', tfidf_vect),
        ('classifier', linear_svc_clf),
    ]
)
pipeline_model = clf_pipeline.fit(x_train, y_train)

In [57]:
# Predict labels for the held-out texts with the fitted linear-SVC pipeline.
y_pred = pipeline_model.predict(X=x_test)

In [58]:
# Score the linear-SVC predictions against the held-out labels; the bare
# expression on the last line makes the notebook display the result.
Accuracy_score = accuracy_score(y_true=y_test, y_pred=y_pred)
Accuracy_score

0.8894508670520231

In [59]:
# Persist the fitted linear-SVC pipeline. The `with` block guarantees the file
# handle is flushed and closed even if pickling raises.
with open('models/linear_svc_clf/model.pkl', 'wb') as model_file:
    pickle.dump(pipeline_model, model_file)