# Sentiment of Movie Reviews dataset 
Size of dataset: 6918 rows, 2 columns

# Import Libraries

In [1]:
import sklearn
import pickle
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Read data using pandas
Taking clean data for demonstration of model deployment using cloud functions

In [2]:
# Read the tab-separated review file. It has no header row, so the two column
# names are supplied explicitly.
sentimental_data = pd.read_csv(
    'datasets/sentimental_analysis_data.csv',
    sep='\t',
    names=['Label', 'Text'],
    header=None,
)

# Display ten randomly chosen rows as a quick look at the data.
sentimental_data.sample(10)

Unnamed: 0,Label,Text
1525,1,the last stand and Mission Impossible 3 both w...
6717,0,"Oh, and Brokeback Mountain was a terrible movie."
2969,1,I love Harry Potter.
5849,0,"Is it just me, or does Harry Potter suck?..."
4234,0,da vinci code sucks...
6578,0,Brokeback Mountain is fucking horrible..
3987,0,ERM da vinci code and it sucked..
99,1,"I loved the Da Vinci code, and I can't wait fo..."
5876,0,"I hate Harry Potter, that daniel wotshisface n..."
6749,0,"Then snuck into Brokeback Mountain, which is t..."


In [3]:
# Show the frame's dimensions as (rows, columns).
sentimental_data.shape

(6918, 2)

# Taking text into variable X and target into Y

In [4]:
# Model input is the review text column; the target is the label column.
X, Y = sentimental_data['Text'], sentimental_data['Label']

# Split the data into 80% and 20% for training and testing

In [5]:
# Hold out 20% of the rows for testing. random_state pins the shuffle so the
# same rows land in the train and test sets on every Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# TF-IDF 
TFIDF, short for term frequency–inverse document frequency, is a numerical statistic that is 
intended to reflect how important a word is to a document in a collection or corpus.
`TfidfVectorizer` transforms text into feature vectors that can be used as input to an estimator.
Its `vocabulary_` attribute is a dictionary that maps each token (word) to a feature index in the matrix; 
each unique token gets its own feature index.

In [6]:
# With max_features=15, TfidfVectorizer keeps only the 15 most frequent terms:
# the vocabulary is ranked by term frequency across the corpus, not by TF-IDF score.

tfidf_vect = TfidfVectorizer(max_features=15)

In [23]:
# Fit the vectorizer on the training text so its vocabulary exists when this
# cell runs in order on a fresh kernel. The pipelines below refit it on the
# same data, so fitting here does not change their results.
tfidf_vect.fit(x_train)

# get_feature_names() was removed in scikit-learn 1.2. Prefer its replacement
# and fall back to the legacy method on older releases.
if hasattr(tfidf_vect, 'get_feature_names_out'):
    feature_names = tfidf_vect.get_feature_names_out().tolist()
else:
    feature_names = tfidf_vect.get_feature_names()

print(feature_names)

['and', 'awesome', 'brokeback', 'code', 'da', 'harry', 'impossible', 'is', 'love', 'mission', 'mountain', 'potter', 'the', 'vinci', 'was']


# Apply Logistic Regression Model and save the model as .pkl file

In [7]:
# Logistic-regression estimator configured with the 'liblinear' solver. It is
# left unfitted here and trained later as the final step of the pipeline.
logistic_clf = LogisticRegression(solver='liblinear')

In [8]:
# Chain the TF-IDF vectorizer with the logistic-regression classifier so raw
# text goes in and predicted labels come out of a single object.
clf_pipeline = Pipeline(
    steps=[
        ('tfidf_vect', tfidf_vect),
        ('classifier', logistic_clf),
    ]
)

# Train both steps on the training split.
pipeline_model = clf_pipeline.fit(x_train, y_train)

In [9]:
# Predict labels for the held-out test text with the fitted logistic-regression pipeline.
y_pred = pipeline_model.predict(x_test)

In [10]:
# Share of test rows whose predicted label equals the true label.
Accuracy_score = accuracy_score(y_true=y_test, y_pred=y_pred)
Accuracy_score

0.9039017341040463

In [11]:
# Persist the fitted logistic-regression pipeline. The context manager makes
# sure the file handle is flushed and closed even if pickling raises.
with open('models/logistic_clf/model.pkl', 'wb') as model_file:
    pickle.dump(pipeline_model, model_file)

# Apply Decision Tree Model and save the model as .pkl file

In [12]:
# Decision tree limited to a depth of 10. random_state is fixed so that tree
# construction is deterministic for a given training set; scikit-learn
# otherwise permutes candidate features randomly when searching for splits.
decision_tree_clf = DecisionTreeClassifier(max_depth=10, random_state=42)

In [13]:
# Chain the TF-IDF vectorizer with the decision-tree classifier.
clf_pipeline = Pipeline(
    steps=[
        ('tfidf_vect', tfidf_vect),
        ('classifier', decision_tree_clf),
    ]
)

# Train both steps on the training split.
pipeline_model = clf_pipeline.fit(x_train, y_train)

In [14]:
# Predict labels for the held-out test text with the fitted decision-tree pipeline.
y_pred = pipeline_model.predict(x_test)

In [15]:
# Share of test rows the decision-tree pipeline labelled correctly.
Accuracy_score = accuracy_score(y_true=y_test, y_pred=y_pred)
Accuracy_score

0.9010115606936416

In [16]:
# Persist the fitted decision-tree pipeline. The context manager makes sure
# the file handle is flushed and closed even if pickling raises.
with open('models/decision_tree_clf/model.pkl', 'wb') as model_file:
    pickle.dump(pipeline_model, model_file)

# Apply Linear SVC and save the model as .pkl file

In [17]:
# Linear support-vector classifier with C=1.0 and an iteration cap of 100
# (max_iter). It is left unfitted here and trained later inside the pipeline.
# NOTE(review): no random_state is set; confirm whether run-to-run variation matters here.
linear_svc_clf = LinearSVC(C=1.0, max_iter=100)

In [18]:
# Chain the TF-IDF vectorizer with the linear SVC classifier.
clf_pipeline = Pipeline(
    steps=[
        ('tfidf_vect', tfidf_vect),
        ('classifier', linear_svc_clf),
    ]
)

# Train both steps on the training split.
pipeline_model = clf_pipeline.fit(x_train, y_train)

In [19]:
# Predict labels for the held-out test text with the fitted linear-SVC pipeline.
y_pred = pipeline_model.predict(x_test)

In [20]:
# Share of test rows the linear-SVC pipeline labelled correctly.
Accuracy_score = accuracy_score(y_true=y_test, y_pred=y_pred)
Accuracy_score

0.903179190751445

In [21]:
# Persist the fitted linear-SVC pipeline. The context manager makes sure the
# file handle is flushed and closed even if pickling raises.
with open('models/linear_svc_clf/model.pkl', 'wb') as model_file:
    pickle.dump(pipeline_model, model_file)