In [287]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error

import statsmodels.api as sm
import seaborn as sns

import re
%matplotlib inline

In [288]:
dataset = pd.read_csv('final_classification_dataset.csv')
#dataset.select(lambda x: not re.search('empath_', x), axis=1)

In [289]:
#dataset = dataset.drop('category',axis=1)
dataset = dataset.drop('ASIN',axis=1)
dataset = dataset.drop('salesRank',axis=1)
dataset = dataset.drop('reviewSentimentCom',axis=1)
dataset = dataset.drop('descriptionSentimentCom',axis=1)
# One Hot Encoding.
dataset = pd.get_dummies(dataset)

In [290]:
data_vars = dataset.columns.values.tolist()
# Target Variable
y = 'class'
# Independent Variables - Feature List
X = [i for i in data_vars if i not in y]

# Data transformation for Analysis
X = np.array(dataset[X])
y = np.array(dataset[y])
y = y.ravel()

#print(X.shape)
#print(y.shape)

#print(dataset['class'].value_counts())

In [291]:
# Mututal Information - For feature Selection.
clf = mutual_info_classif(X,y)
#print(clf)

In [293]:
feature_cols = []
for col in dataset.columns:
    if col != 'class':
        feature_cols.append(col)

# pair the feature names with the coefficients
nameCoefDict = {}
for f,c in zip(feature_cols,list(clf)):
    #print(f," : ",c)
    nameCoefDict[f] = c

#from collections import OrderedDict
d_sorted_by_value = OrderedDict(sorted(nameCoefDict.items(), key=lambda x: x[1]))
good_features = []
for k, v in d_sorted_by_value.items():
    if v != 0.0:
        good_features.append(k)

#for f in good_features:
#    print(f,d_sorted_by_value[f])
#len(good_features)  

# Extract features from Mutual Information and apply those features in Logistic Regression and Random Forest.
X_new = dataset[good_features]
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.25)

In [294]:
#Logistic Regression with 10 fold cross-validation
logistic = LogisticRegressionCV(cv=10)
logistic = logistic.fit(X_train,y_train)
#print(logistic.coef_)
y_pred = logistic.predict(X_test)
print(accuracy_score(y_test,y_pred))
print(np.sqrt(mean_squared_error(y_test, y_pred)))
print(confusion_matrix(y_test,y_pred))
tn, fp, fn, tp = confusion_matrix(y_test,y_pred).ravel()
print("True Negative", tn)
print("False Positive", fp)
print("False Negative", fn)
print("True Positive", tp)

0.565403624382
0.659239240047
[[786 745]
 [574 930]]
True Negative 786
False Positive 745
False Negative 574
True Positive 930


In [295]:
# Apply Odds Ratio to Logistic Regression Coefficients.
nameCoefDict = {}
for f,c in zip(feature_cols,list(np.exp(logistic.coef_)[0])):
    #print(f," : ",c)
    nameCoefDict[f] = c
    
from collections import OrderedDict
d_sorted_by_value = OrderedDict(sorted(nameCoefDict.items(), key=lambda x: x[1]))
good_features = []
for k, v in d_sorted_by_value.items():
    if v > 1.5:
        good_features.append(k)
# Print scores after Odds Ratio
for f in good_features:
    print(f,d_sorted_by_value[f])
len(good_features)       

empath_description_warmth 1.50712667822
empath_description_negative_emotion 1.51153388824
empath_description_horror 1.51186768634
empath_description_divine 1.51632432092
empath_description_party 1.54766072472
empath_description_ugliness 1.55623374593
empath_description_achievement 1.59306845047
empath_description_art 1.60015500828
empath_description_kill 1.6109185783
empath_description_giving 1.61591212959
empath_description_exercise 1.6264944501
empath_description_domestic_work 1.63287137585
empath_description_pride 1.69873221995
empath_review_hate 1.72095814404
empath_description_neglect 1.72139476464
empath_description_ocean 1.75910102948
empath_description_monster 1.76600289145
empath_description_crime 1.77217950974
empath_description_technology 1.77949064058
empath_description_reading 1.85752670139
empath_description_gain 1.8698174191
empath_description_noise 1.87033402242
empath_review_money 1.93036136831
empath_description_beauty 2.01584786484
empath_description_valuable 2.01910

65

In [298]:
# data_vars = dataset.columns.values.tolist()
# # Target Variable
# y = 'class'
# # Independent Variables - Feature List
# X = [i for i in data_vars if i not in y]

# # Data transformation for Analysis
# X = np.array(dataset[X])
# y = np.array(dataset[y])
# y = y.ravel()
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

# Random Forest Classifier.
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(n_estimators=250,max_features=150)
forest.fit(X_train,y_train)
#print(forest.feature_importances_)
predict = forest.predict(X_test)
print(accuracy_score(y_test,predict))
print(np.sqrt(mean_squared_error(y_test, predict)))
print(confusion_matrix(y_test,predict))
tn, fp, fn, tp = confusion_matrix(y_test,predict).ravel()
print("True Negative", tn)
print("False Positive", fp)
print("False Negative", fn)
print("True Positive", tp)

0.605271828666
0.628273962006
[[855 676]
 [522 982]]
True Negative 855
False Positive 676
False Negative 522
True Positive 982
