In [1]:
from pandas.io.json import json_normalize
import json

In [2]:
# Load the Automotive reviews and convert them into a pandas DataFrame.
# The file is presumably a run of JSON objects ending in a trailing comma ("{...},{...},")
# rather than a valid JSON document, so the comma is removed and the text wrapped in []
# to form a JSON array.
with open('./categories_reviews/Automotive_reviews.json', 'r') as file:
    # Strip trailing whitespace before the comma: a blind [:-1] slice would remove only a
    # final newline (leaving the comma) or chop the closing brace when no comma is present.
    raw_records = file.read().rstrip().rstrip(',')

data = json.loads(f"[{raw_records}]")
df_Automotive = json_normalize(data)

# Binary sentiment label: 4-5 stars -> 1 (positive), 1-3 stars -> 0 (negative)
df_Automotive['label'] = df_Automotive['stars'].isin(['4-star', '5-star']).astype(int)

In [3]:
# Preview the top rows to sanity-check the loaded columns and the derived label
df_Automotive.head()

Unnamed: 0,date,reviewer,stars,text,label
0,2018-02-25,Cheryl Jones,5-star,I arrived at 3 PM and the dealership closed at...,1
1,2017-07-07,Michele Leslie,1-star,I dropped my car off on a Wednesday morning fo...,0
2,2016-03-31,Manuel Atherton,5-star,My parents have been buying cars off of Donna ...,1
3,2016-03-06,Mildred Farrow,5-star,I recently bought another car from Donna Dunni...,1
4,2015-07-20,Rafael Davis,5-star,I had to schedule an appointment due to the ai...,1


In [4]:
# Pre-Processing 
import string
import re
import nltk

# The line below is required the first time, to download the stopwords corpus from NLTK;
# run it at least once after (re)installing the environment.
# nltk.download("stopwords")
from nltk.corpus import stopwords
# Stored as a set; pre_process checks each token for membership in it
stop_words = set(stopwords.words('english'))

def pre_process(text):
    """Turn a raw review string into a list of cleaned tokens.

    The text is lower-cased, every character in ``string.punctuation`` and every
    digit character is deleted, the remainder is split on whitespace, and tokens
    found in the module-level ``stop_words`` are dropped.

    Returns the surviving tokens as a list, in their original order.
    """
    lowered = text.lower()

    # delete punctuation characters outright (nothing is inserted in their place)
    without_punctuation = lowered.translate(str.maketrans('', '', string.punctuation))

    # numbers indicate nothing about sentiment, so delete every digit as well
    letters_only = re.sub(r'\d', '', without_punctuation)

    # split on whitespace and keep only the non-stopword tokens
    tokens = []
    for token in letters_only.split():
        if token in stop_words:
            continue
        tokens.append(token)
    return tokens

In [5]:
# machine learning library imports 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC

# Transform each text into a vector of token counts. pre_process is passed as a callable
# analyzer, so it performs all tokenising and stopword removal itself. CountVectorizer's
# own stop_words option only applies when analyzer == 'word', so the previous
# stop_words="english" argument had no effect (newer scikit-learn versions warn that it
# is unused) and has been dropped.
vectorizer = CountVectorizer(analyzer=pre_process)

# Linear Support Vector Machine classifier
SVM_model = LinearSVC()

In [6]:
# SECOND PART - TRAINING + EVALUATION

# Hold out 30% of the Automotive reviews for testing and train on the remaining 70%,
# so the model is scored on reviews it was not trained on. The fixed random_state
# keeps the split reproducible.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    df_Automotive['text'],
    df_Automotive["label"],
    test_size=0.3,
    random_state=101,
)

# TRAINING: learn the vocabulary and fit the classifier on the training split only
training_features = vectorizer.fit_transform(X_train)
SVM_model.fit(training_features, y_train)

# PREDICTION: encode the held-out reviews with the fitted vocabulary and classify them
test_features = vectorizer.transform(X_test)
pred = SVM_model.predict(test_features)


# EVALUATION - shows different rates calculated using confusion matrix
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float("nan")


def evaluate_and_show(confusion_mat):
    """Print a 2x2 confusion matrix and the rates derived from it.

    Parameters
    ----------
    confusion_mat : 2x2 array-like of counts
        Rows are the actual class and columns the predicted class, both ordered
        (NO, YES), i.e. ``[[TN, FY], [FN, TY]]``. Nested lists and 2-D arrays are
        both accepted.

    Raises
    ------
    ValueError
        If ``confusion_mat`` is not 2x2 (for example when only one class is present,
        or for a multi-class matrix), since the binary rates below would be undefined
        or silently wrong.

    Notes
    -----
    A rate whose denominator is zero (e.g. specificity when there are no actual
    negatives) is reported as ``nan`` instead of raising or emitting a warning.
    The function only prints; it returns ``None``.
    """
    rows = [list(row) for row in confusion_mat]
    if len(rows) != 2 or any(len(row) != 2 for row in rows):
        raise ValueError("evaluate_and_show expects a 2x2 confusion matrix")

    #TN = True No   - actual negative review predicted negative
    #FY = False Yes - actual negative review wrongly predicted positive
    #FN = False No  - actual positive review wrongly predicted negative
    #TY = True Yes  - actual positive review predicted positive
    (TN, FY), (FN, TY) = rows

    total = TN + FY + FN + TY
    # column totals: how many reviews were predicted NO / predicted YES
    predicted_no, predicted_yes = TN + FN, FY + TY

    # print confusion matrix properly
    # (the backslash continuations keep the leading spaces of each source line in the
    #  output, so the layout below must stay exactly as written)
    print(f"confusion matrix: \
      \n Total={total} \t| Predicted NO  | Predicted YES \
      \n Actual NO \t| TN={TN} \t| FY={FY} \
      \n Actual YES \t| FN={FN} \t| TY={TY} \
      \n Total\t\t| {predicted_no} \t\t| {predicted_yes}\n")

    # Accuracy = Overall, how often is the classifier correct (true yes + true no)
    accuracy = _safe_ratio(TN + TY, total)
    print("Accuracy = {:.2f} %".format(accuracy*100))

    # Error % = Overall, how often is it wrong
    error_rate = 1 - accuracy
    print("Error % = {:.2f} %".format(error_rate*100))

    # True Yes Rate (Sensitivity): When it's actually yes, how often does it predict yes
    Sensitivity = _safe_ratio(TY, TY + FN)
    print("Sensitivity = {:.2f} %".format(Sensitivity*100))

    # False Yes Rate: When it's actually no, how often does it predict yes
    FY_rate = _safe_ratio(FY, TN + FY)
    print("False Yes rate = {:.2f} %".format(FY_rate*100))

    # True No Rate (Specificity): When it's actually no, how often does it predict no
    Specificity = _safe_ratio(TN, TN + FY)
    print("Specificity = {:.2f} %".format(Specificity*100))

    # Precision: When it predicts yes, how often is it correct?
    precision = _safe_ratio(TY, FY + TY)
    print("Precision rate = {:.2f} %".format(precision*100))

    # Prevalence: How often does the yes condition actually occur in our sample?
    Prevalence = _safe_ratio(FN + TY, total)
    print("Prevalence rate = {:.2f} %".format(Prevalence*100))
    
    
# Score the held-out predictions: build the confusion matrix from the true and predicted
# labels, then hand it to evaluate_and_show to print the derived rates
from sklearn.metrics import confusion_matrix
held_out_confusion = confusion_matrix(y_test, pred)
evaluate_and_show(held_out_confusion)

confusion matrix:       
 Total=437 	| Predicted NO  | Predicted YES       
 Actual NO 	| TN=137 	| FY=22       
 Actual YES 	| FN=16 	| TY=262       
 Total		| 153 		| 284

Accuracy = 91.30 %
Error % = 8.70 %
Sensitivity = 94.24 %
False Yes rate = 13.84 %
Specificity = 86.16 %
Precision rate = 92.25 %
Prevalence rate = 63.62 %


In [7]:
# THIRD PART
# Refit the vectorizer and the classifier on 100% of the Automotive reviews.
# df_Automotive is already in memory, so the file does not need to be read again.
automotive_texts, automotive_labels = df_Automotive['text'], df_Automotive["label"]
training_features = vectorizer.fit_transform(automotive_texts)
SVM_model.fit(training_features, automotive_labels)

# Training Automotive --------> Testing Bars

# Load the Bars reviews, which are used as the test set for the Automotive-trained model.
# The file is presumably a run of JSON objects ending in a trailing comma ("{...},{...},")
# rather than a valid JSON document, so the comma is removed and the text wrapped in []
# to form a JSON array.
with open('./categories_reviews/Bars_reviews.json', 'r') as file:
    # Strip trailing whitespace before the comma: a blind [:-1] slice would remove only a
    # final newline (leaving the comma) or chop the closing brace when no comma is present.
    raw_records = file.read().rstrip().rstrip(',')

data = json.loads(f"[{raw_records}]")
df_Bars = json_normalize(data)

# Same labelling rule as for Automotive: 4-5 stars -> 1 (positive), 1-3 stars -> 0 (negative)
df_Bars['label'] = df_Bars['stars'].isin(['4-star', '5-star']).astype(int)

# Encode the Bars reviews with the currently fitted vocabulary and classify them
bars_texts = df_Bars["text"]
test_features = vectorizer.transform(bars_texts)
pred = SVM_model.predict(test_features)

# Evaluation: compare the Bars labels against the model's predictions
bars_confusion = confusion_matrix(df_Bars["label"], pred)
evaluate_and_show(bars_confusion)

confusion matrix:       
 Total=1460 	| Predicted NO  | Predicted YES       
 Actual NO 	| TN=351 	| FY=214       
 Actual YES 	| FN=96 	| TY=799       
 Total		| 447 		| 1013

Accuracy = 78.77 %
Error % = 21.23 %
Sensitivity = 89.27 %
False Yes rate = 37.88 %
Specificity = 62.12 %
Precision rate = 78.87 %
Prevalence rate = 61.30 %


In [8]:
# THIRD PART (reverse direction): Training Bars --------> Testing Automotive
# Refit the vectorizer and the classifier on all Bars reviews
bars_texts, bars_labels = df_Bars['text'], df_Bars["label"]
training_features = vectorizer.fit_transform(bars_texts)
SVM_model.fit(training_features, bars_labels)

# Classify every Automotive review with the Bars-trained model
test_features = vectorizer.transform(df_Automotive['text'])
pred = SVM_model.predict(test_features)

# Evaluation: compare the Automotive labels against the model's predictions
automotive_confusion = confusion_matrix(df_Automotive["label"], pred)
evaluate_and_show(automotive_confusion)

confusion matrix:       
 Total=1455 	| Predicted NO  | Predicted YES       
 Actual NO 	| TN=431 	| FY=51       
 Actual YES 	| FN=275 	| TY=698       
 Total		| 706 		| 749

Accuracy = 77.59 %
Error % = 22.41 %
Sensitivity = 71.74 %
False Yes rate = 10.58 %
Specificity = 89.42 %
Precision rate = 93.19 %
Prevalence rate = 66.87 %
