In [None]:
import os
import sys
import pymongo
from time import sleep
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# splitting data
from sklearn.model_selection import train_test_split

# Features Extraction
from sklearn.feature_extraction.text import CountVectorizer

# Evaluation
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

# models
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC


In [None]:
# Make the sibling ``scraping_cleaning`` directory importable. The relative
# path is resolved against the current working directory, so the notebook
# has to be started from its own folder.
sys.path.append(os.path.abspath('../scraping_cleaning'))
# NOTE(review): the star import hides which names this module contributes;
# arabic_pip_line (used further down) presumably comes from here -- confirm
# and consider importing it explicitly.
from cleaning_data import *

In [None]:
# Load the raw positive and negative review tables.
df_positive = pd.read_csv('../csv_files/positive_reviews.csv')
df_negative = pd.read_csv('../csv_files/negative_reviews.csv')
# Keep only the first 8000 positive rows. The explicit .copy() detaches the
# result from the full frame, so the in-place dropna / column assignment done
# in later cells act on an independent object instead of a slice (which can
# raise SettingWithCopyWarning).
df_positive = df_positive.iloc[:8000].copy()

In [None]:
# Drop every row that has a missing value in any column. Rebinding the name
# (instead of inplace=True) never writes into a frame that may still be a
# slice of another one.
df_negative = df_negative.dropna()
df_positive = df_positive.dropna()

In [None]:
# Number of negative reviews left after dropping incomplete rows.
df_negative.shape[0]

In [None]:
# Number of positive reviews left after dropping incomplete rows.
df_positive.shape[0]

In [None]:
# Attach the class label: 0 for negative reviews, 1 for positive reviews.
# .assign returns a new frame, so no column is written into a frame that may
# be a slice of another one.
df_negative = df_negative.assign(polarity=0)
df_positive = df_positive.assign(polarity=1)

In [None]:
# Keep only the review text and its label in each frame.
df_positive = df_positive.loc[:, ['positive_review', 'polarity']]
df_negative = df_negative.loc[:, ['negative_review', 'polarity']]

In [None]:
# Make the sibling ``file_handles`` directory importable (path is resolved
# against the current working directory).
sys.path.append(os.path.abspath('../file_handles'))
# NOTE(review): star import -- combine_positive_negative_reviews and
# shuffle_dataframe_of_reviews (used below) presumably come from this module;
# confirm and consider importing them by name.
from handle_combind_shuffle_reviews import *

In [None]:
# Give both frames the same text column name so they can be combined.
df_positive = df_positive.rename({'positive_review': 'reviews'}, axis='columns')
df_negative = df_negative.rename({'negative_review': 'reviews'}, axis='columns')

In [None]:
# Preview the first five positive rows.
df_positive.iloc[:5]

In [None]:
# Preview the first five negative rows.
df_negative.iloc[:5]

In [None]:
# Combine both labelled frames with the project helper, then pull the review
# texts out as a plain Python list. The texts are still raw here; cleaning
# happens in a later cell.
combined_reviews = combine_positive_negative_reviews(df_positive, df_negative)
cleaned_reviews = combined_reviews['reviews'].tolist()

In [None]:
# Run the project's cleaning pipeline over the raw review strings.
# NOTE(review): the result is stored back as the 'reviews' column afterwards,
# so this presumably returns one cleaned item per input, in the same order --
# confirm against arabic_pip_line.
cleaned_reviews = arabic_pip_line(cleaned_reviews)

In [None]:
# Replace the raw texts with their cleaned versions. .assign builds a new
# frame rather than mutating the one shown by earlier cells, and the cell
# gives the same result when re-run.
combined_reviews = combined_reviews.assign(reviews=cleaned_reviews)

In [None]:
# Preview the first five rows of the combined frame.
combined_reviews.iloc[:5]

In [None]:
# Shuffle the combined reviews with the project helper.
# NOTE(review): no seed is passed here -- confirm whether the helper shuffles
# deterministically; otherwise row order differs between runs.
df = shuffle_dataframe_of_reviews(combined_reviews)

In [None]:
# Remove rows that are exact duplicates across all columns (same text and
# same label). Rebinding instead of inplace=True keeps the frame returned by
# the shuffle helper untouched.
df = df.drop_duplicates()

In [None]:
# Number of reviews left after de-duplication.
df.shape[0]

In [None]:
# Features are the review texts; the target is the 0/1 polarity label.
X, y = df['reviews'], df['polarity']

In [None]:
# Hold out 10% of the reviews for testing. A fixed random_state makes the
# split, and every score computed from it, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)
X_train[:5]

In [None]:
# Sanity check: texts and labels must have matching sizes within each split.
print(f"Our training data now are: {len(X_train)} Reviews")
print(f"Our testing data now are: {len(X_test)} Reviews")
print(f"Our training data now are: {len(y_train)} labels")
print(f"Our testing data now are: {len(y_test)} labels")

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def tfidf_vectorizer(df, train_texts=None, test_texts=None):
    '''
    Turn the train / test review texts into dense TF-IDF feature arrays.

    The vectorizer is fitted on the training texts only, so the vocabulary
    and IDF weights are not influenced by the held-out test reviews.

    Arguments:
        df: dataframe of multiple reviews. Not read by this function; kept
            so existing calls of the form ``tfidf_vectorizer(df)`` still work.
        train_texts: iterable of training review strings. Defaults to the
            notebook-level ``X_train``.
        test_texts: iterable of test review strings. Defaults to the
            notebook-level ``X_test``.
    return:
        (training_data, testing_data): dense arrays with one row per review
        and one column per vocabulary term, ready to fit to the model.
    '''
    # Fall back to the notebook-level splits when none are passed explicitly.
    if train_texts is None:
        train_texts = X_train
    if test_texts is None:
        test_texts = X_test

    # Learn vocabulary and IDF weights from the training split only.
    vectorizer = TfidfVectorizer()
    vectorizer.fit(train_texts)
    word_idf_weights = vectorizer.idf_
    print("Our 10 words weights\n\n", word_idf_weights[:10])

    # Project both splits onto the vocabulary learned above.
    training_data = vectorizer.transform(train_texts)
    testing_data = vectorizer.transform(test_texts)

    # transform() yields sparse matrices; callers expect dense arrays.
    return training_data.toarray(), testing_data.toarray()

In [None]:
# Build the TF-IDF feature matrices for both splits.
# NOTE(review): the texts that get transformed are the notebook-level
# X_train / X_test, not the `df` passed here, so the split cell must have
# been run first.
training_data, testing_data = tfidf_vectorizer(df)

In [None]:
# Each matrix has shape (number of reviews, vocabulary size): train first, test second.
print(f"Our new vectorized data: {training_data.shape}")
print(f"Our new vectorized data: {testing_data.shape}")
print("The first 2 review after transform: \n", testing_data[:2])

In [None]:
# Multinomial Naive Bayes classifier; no arguments are passed, so it uses
# scikit-learn's default hyper-parameters.
clf_MultinomialNB = MultinomialNB()

In [None]:
# Fit Naive Bayes on the training TF-IDF features.
model = clf_MultinomialNB.fit(X=training_data, y=y_train)

In [None]:
# Naive Bayes predictions on the training split.
predict = model.predict(X=training_data)

In [None]:
# Micro-averaged F1 of Naive Bayes on the training split.
# NOTE(review): with average='micro' on single-label data this should equal
# plain accuracy -- confirm this is the intended metric.
nb_train_f1 = f1_score(y_true=y_train, y_pred=predict, average='micro')
print("F1 score of our training data is: ", nb_train_f1)

In [None]:
# Confusion matrix of Naive Bayes on the training split (rows: true label, columns: predicted).
nb_train_confusion = confusion_matrix(y_true=y_train, y_pred=predict)
print("Evalution Matrix of training data is \n", nb_train_confusion)

In [None]:
# Naive Bayes predictions on the held-out test split.
predict = model.predict(X=testing_data)

In [None]:
# Micro-averaged F1 of Naive Bayes on the test split.
nb_test_f1 = f1_score(y_true=y_test, y_pred=predict, average='micro')
print("F1 score of our testing data is: ", nb_test_f1)

In [None]:
# `predict` holds the Naive Bayes predictions for the test split at this
# point and is compared against y_test, so the message must say "testing".
print("Evalution Matrix of testing data is \n", confusion_matrix(y_test, predict))

In [None]:
# L2-penalised logistic regression using the liblinear solver, a tight
# convergence tolerance and up to 1000 iterations.
clf_LogisticRegression = LogisticRegression(
    penalty='l2',
    solver='liblinear',
    tol=1e-5,
    max_iter=1000,
)

In [None]:
# Fit logistic regression on the training TF-IDF features.
logistic_model = clf_LogisticRegression.fit(X=training_data, y=y_train)

In [None]:
# Logistic-regression predictions on the training split.
predict = logistic_model.predict(X=training_data)

In [None]:
# `predict` holds the logistic-regression predictions for the training split
# at this point and is compared against y_train, so the message must say
# "training".
print("F1 score of our training data is: ", f1_score(y_train, predict, average='micro'))

In [None]:
# Confusion matrix of logistic regression on the training split.
logistic_train_confusion = confusion_matrix(y_true=y_train, y_pred=predict)
print("Evalution Matrix of training data is \n", logistic_train_confusion)

In [None]:
# Logistic-regression predictions on the held-out test split.
predict = logistic_model.predict(X=testing_data)

In [None]:
# Micro-averaged F1 of logistic regression on the test split.
logistic_test_f1 = f1_score(y_true=y_test, y_pred=predict, average='micro')
print("F1 score of our testing data is: ", logistic_test_f1)

In [None]:
# `predict` holds the logistic-regression predictions for the test split at
# this point and is compared against y_test, so the message must say
# "testing".
print("Evalution Matrix of testing data is \n", confusion_matrix(y_test, predict))

In [None]:
# Support-vector classifier with a linear kernel; all other hyper-parameters
# are left at scikit-learn's defaults.
clf_SVC = SVC(kernel='linear')

In [None]:
# Fit the linear SVC on the training TF-IDF features.
svc_model = clf_SVC.fit(X=training_data, y=y_train)

In [None]:
# SVC predictions on the training split.
predict = svc_model.predict(X=training_data)

In [None]:
# `predict` holds the SVC predictions for the training split at this point
# and is compared against y_train, so the message must say "training".
print("F1 score of our training data is: ", f1_score(y_train, predict, average='micro'))

In [None]:
# Confusion matrix of the SVC on the training split.
svc_train_confusion = confusion_matrix(y_true=y_train, y_pred=predict)
print("Evalution Matrix of training data is \n", svc_train_confusion)