In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

from sklearn.model_selection import train_test_split

In [None]:
# Load the IMDB reviews CSV into a DataFrame, decoding the file as ISO-8859-1.
# NOTE(review): the path is absolute and machine-specific — consider reading it
# from a configurable data directory instead.
imdb = pd.read_csv('/home/kafka/kafka/imdb.csv', encoding='ISO-8859-1')
# Optional quick checks of the loaded frame (left disabled):
#imdb['label'].value_counts()
#imdb.head()

In [None]:
# Keep only the review text and its label; every other column is dropped.
wanted_columns = ['review', 'label']
imdb_ok = imdb[wanted_columns]

In [None]:
# Discard every row whose label is "unsup"; the remaining rows are rebound to `imdb`.
has_sentiment = imdb_ok['label'] != "unsup"
imdb = imdb_ok[has_sentiment]

In [None]:
def label_sentiment(row):
    """Translate a row's text label into a numeric sentiment code.

    Parameters
    ----------
    row : mapping-like
        Any object that can be indexed with ``'label'`` (for example a
        DataFrame row).

    Returns
    -------
    int or None
        ``0`` when the label is ``"neg"``, ``1`` when it is ``"pos"``,
        and ``None`` for any other label.
    """
    label = row['label']
    if label == "neg":
        return 0
    elif label == "pos":
        return 1
    # Any other label falls through without a numeric code.
    return None

In [None]:
# Run label_sentiment on every row (axis=1) to get one numeric code per review.
imdb_def = imdb.apply(label_sentiment, axis=1)
imdb_def

0        0
1        0
2        0
3        0
4        0
        ..
49995    1
49996    1
49997    1
49998    1
49999    1
Length: 50000, dtype: int64

In [None]:
# Attach the numeric sentiment codes as a new 'target' column.
# `assign` returns a new frame instead of writing into `imdb`, which at this
# point is a boolean-filtered selection of `imdb_ok`; the in-place form
# `imdb['target'] = ...` raises pandas' SettingWithCopyWarning on such a
# selection. Values are aligned on the index exactly as before, and re-running
# the cell simply overwrites the column.
imdb = imdb.assign(target=imdb_def)
imdb.tail()

Unnamed: 0,review,label,target
49995,"Seeing as the vote average was pretty low, and...",pos,1
49996,"The plot had some wretched, unbelievable twist...",pos,1
49997,I am amazed at how this movie(and most others ...,pos,1
49998,A Christmas Together actually came before my t...,pos,1
49999,Working-class romantic drama from director Mar...,pos,1


In [None]:
# Fixed random seed, passed as `random_state` to the train/test split and to
# the SGD classifier so that results are repeatable between runs.
seed = 123

In [None]:
# Measuring run time
from time import time

# Data manipulation/analysis
import numpy as np

# Text preprocessing/analysis
import re, random
from nltk import word_tokenize, sent_tokenize, pos_tag
from nltk.util import ngrams
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:

def preprocess_text(text):
    """Split ``text`` into lowercased, verb-lemmatised alphabetic tokens.

    Parameters
    ----------
    text : str
        Raw text to tokenise.

    Returns
    -------
    list of str
        One entry per run of ASCII letters found in ``text``, lowercased and
        lemmatised with WordNet using the verb part-of-speech.
    """
    # Step 1: keep only runs of ASCII letters; everything else separates tokens.
    words = RegexpTokenizer(r'[A-Za-z]+').tokenize(text)

    # Step 2: lowercase each word, then reduce it to its verb lemma.
    wordnet = WordNetLemmatizer()
    lemmas = []
    for word in words:
        lemmas.append(wordnet.lemmatize(word.lower(), pos='v'))
    return lemmas

In [None]:
# TF-IDF vectoriser that uses preprocess_text as its analyser.
# NOTE(review): the pipeline built further down creates its own
# TfidfVectorizer, so this object looks unused in the visible cells — confirm.
vectoriser = TfidfVectorizer(
    analyzer=preprocess_text,
    min_df=30,
    max_df=0.7,
)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, RandomizedSearchCV


In [None]:
# Hold out 5000 reviews for testing; the split is made repeatable via `seed`.
X_train, X_test, y_train, y_test = train_test_split(
    imdb['review'],
    imdb['target'],
    test_size=5000,
    random_state=seed,
)

In [None]:
# Two-step pipeline: TF-IDF over unigrams and bigrams of lowercase-letter
# tokens, followed by a linear classifier trained with SGD on the hinge loss.
pipe = Pipeline(steps=[
    ('vectoriser', TfidfVectorizer(
        token_pattern=r'[a-z]+',
        min_df=30,
        max_df=.6,
        ngram_range=(1, 2),
    )),
    ('model', SGDClassifier(random_state=seed, loss='hinge')),
])

# Fit on the training split; the fitted pipeline is the cell's displayed value.
pipe.fit(X_train, y_train)

Pipeline(steps=[('vectoriser',
                 TfidfVectorizer(max_df=0.6, min_df=30, ngram_range=(1, 2),
                                 token_pattern='[a-z]+')),
                ('model', SGDClassifier(random_state=123))])

In [None]:
############# THE COEFFICIENTS OF OUR PIPELINE

# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Use its replacement when available and fall back only on older versions,
# so the cell runs on both.
fitted_vectoriser = pipe['vectoriser']
if hasattr(fitted_vectoriser, 'get_feature_names_out'):
    feature_names = fitted_vectoriser.get_feature_names_out()
else:
    feature_names = fitted_vectoriser.get_feature_names()

# One row of weights with one column per feature ...
coefs = pd.DataFrame(pipe['model'].coef_, columns=feature_names)

# ... transposed into a single 'coef' column indexed by feature, sorted ascending.
coefs = coefs.T.rename(columns={0:'coef'}).sort_values('coef')


#### CHECK RESULTS
# The lowest coefficients come first, the highest last.
print(coefs.head(10))
print('\n')
print(coefs.tail(10))

               coef
bad       -4.959635
worst     -4.603756
awful     -4.118285
boring    -3.778384
the worst -3.594560
poor      -3.436840
terrible  -3.126976
waste     -2.959757
nothing   -2.949903
worse     -2.608583


               coef
well       2.079229
today      2.342685
brilliant  2.350588
the best   2.373143
amazing    2.404742
fun        2.416445
wonderful  2.466989
perfect    2.682182
excellent  3.656383
great      4.111817


In [None]:
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Predictions on the training split. Note this measures fit to the training
# data, not performance on the held-out reviews.
train_pred = pipe.predict(X_train)
target_names = ['negative', 'positive']
# classification_report expects (y_true, y_pred). Passing the predictions
# first swaps precision with recall and reports support for the predicted
# classes instead of the true ones.
print(classification_report(y_train,
                            train_pred,
                            target_names=target_names))
                            

              precision    recall  f1-score   support

    negative       0.93      0.94      0.94     22150
    positive       0.95      0.93      0.94     22850

    accuracy                           0.94     45000
   macro avg       0.94      0.94      0.94     45000
weighted avg       0.94      0.94      0.94     45000



In [None]:
import pickle

In [None]:
# Persist the fitted pipeline (TF-IDF vectoriser + SGD classifier) with pickle.
# NOTE(review): the leading './' makes this path relative to the current
# working directory, unlike the absolute path used to load imdb.csv — confirm
# that this is intended.
pipe_filename = './home/kafka/kafka/model.pkl'
# The context manager closes the file even if pickling fails part-way.
# The handle keeps its original name, although the object being saved is the
# whole pipeline rather than a decision tree.
with open(pipe_filename, 'wb') as decision_tree_model_pkl:
    pickle.dump(pipe, decision_tree_model_pkl)
