# Reading and Understanding Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split,cross_val_score, StratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve, classification_report, precision_recall_curve

In [None]:
# Load the review dataset from a CSV file in the working directory and preview it.
data=pd.read_csv('sample30.csv')
data.head()

In [None]:
# (rows, columns) of the raw dataset
data.shape

In [None]:
# Column dtypes and non-null counts
data.info()

In [None]:
# Summary statistics of the dataset
data.describe()

In [None]:
# Percentage of missing values per column, rounded to two decimals
round(100*data.isnull().sum()/len(data),2)

# Data Cleaning

In [None]:
data=data.drop(['reviews_userProvince','reviews_userCity'],axis=1)
data.head()

In [None]:
round(100*data.isnull().sum()/len(data),2)

In [None]:
# Replace missing values in the descriptive columns with explicit placeholder labels.
placeholder_by_column = {
    'manufacturer': 'Other',
    'reviews_date': 'Other',
    'reviews_didPurchase': 'Unknown',
    'reviews_title': 'No Title',
    'reviews_username': 'No username',
}
for column_name, placeholder in placeholder_by_column.items():
    data[column_name] = data[column_name].replace(np.nan, placeholder)

In [None]:
round(100*data.isnull().sum()/len(data),2)

In [None]:
data.head()

In [None]:
# Number of products: one group per product id, counting the ratings in each group
products_count = data.groupby('id')['reviews_rating'].count()
print(f"Number of Unique Products = {products_count.count()}")

In [None]:
# Top 20 products, ranked by number of reviews (most reviewed first)
sorted_products = products_count.sort_values(ascending=False)

print("Top 20 Reviewed Products:\n")
print(sorted_products.head(20), end='\n\n')
# Take the product id from the ranking itself rather than hard-coding it, so the
# message stays correct when the dataset changes.
print('Most Reviewed Product, {} - has {} reviews.'.format(sorted_products.index[0], sorted_products.iloc[0]))

In [None]:
print("Last 20 Reviewed Products:\n")
# tail(20) always yields the last 20 rows of the ranking; the previous fixed
# offset (251) only did so for one particular number of products.
print(sorted_products.tail(20), end='\n\n')
# This is the *least* reviewed product; report the id found at the bottom of the
# ranking instead of a hard-coded one.
print('Least Reviewed Product, {} - has {} reviews.'.format(sorted_products.index[-1], sorted_products.iloc[-1]))

In [None]:
data['feedback']=data['reviews_title'] +' '+ data['reviews_text']
data.head()

In [None]:
data1=data[['id','name','feedback','reviews_doRecommend','user_sentiment','reviews_rating','reviews_username']]
data1.head()

In [None]:
# Rebind instead of dropping in place: data1 was produced by selecting columns
# from `data`, and an in-place mutation of such a selection triggers pandas'
# SettingWithCopyWarning.
data1 = data1.dropna()
data1.isnull().sum()

In [None]:
data1.shape

In [None]:
# Keep only the reviews of products that have more than 250 reviews.
reviews_per_product = data1.groupby('name')['name'].transform('size')
data1 = data1[reviews_per_product > 250].reset_index(drop=True)
print('Number of products=>', data1['name'].nunique())

In [None]:
data1['reviews_doRecommend']=data1['reviews_doRecommend'].astype(int)

In [None]:
data1.head()

# Data Unbalanced

In [None]:
# Count plot of the sentiment classes.
# seaborn renamed `factorplot` to `catplot` (and its `size` argument to `height`)
# in 0.9 and removed the old names in later releases.
sns.catplot(x="user_sentiment", data=data1, kind="count", height=6, aspect=1.5, palette="PuBuGn_d")
plt.show()

# Text Preprocessing

In [None]:
# Concatenate every review into one space-separated string (displayed only; the result is not stored).
' '.join(data1['feedback'].tolist())

In [None]:
# Lookup table mapping English contractions to their expanded forms.
# con_to_exp() applies every entry with str.replace, so matching is
# case-sensitive and substring-based (e.g. the "'s" entry also matches the
# ending of "let's" or of a possessive).
contractions={"ain't": "are not","'s":" is","aren't": "are not",
                     "can't": "cannot","can't've": "cannot have",
                     "'cause": "because","could've": "could have","couldn't": "could not",
                     "couldn't've": "could not have", "didn't": "did not","doesn't": "does not",
                     "don't": "do not","hadn't": "had not","hadn't've": "had not have",
                     "hasn't": "has not","haven't": "have not","he'd": "he would",
                     "he'd've": "he would have","he'll": "he will", "he'll've": "he will have",
                     "how'd": "how did","how'd'y": "how do you","how'll": "how will",
                     "I'd": "I would", "I'd've": "I would have","I'll": "I will",
                     "I'll've": "I will have","I'm": "I am","I've": "I have", "isn't": "is not",
                     "it'd": "it would","it'd've": "it would have","it'll": "it will",
                     "it'll've": "it will have", "let's": "let us","ma'am": "madam",
                     "mayn't": "may not","might've": "might have","mightn't": "might not", 
                     "mightn't've": "might not have","must've": "must have","mustn't": "must not",
                     "mustn't've": "must not have", "needn't": "need not",
                     "needn't've": "need not have","o'clock": "of the clock","oughtn't": "ought not",
                     "oughtn't've": "ought not have","shan't": "shall not","sha'n't": "shall not",
                     "shan't've": "shall not have","she'd": "she would","she'd've": "she would have",
                     "she'll": "she will", "she'll've": "she will have","should've": "should have",
                     "shouldn't": "should not", "shouldn't've": "should not have","so've": "so have",
                     "that'd": "that would","that'd've": "that would have", "there'd": "there would",
                     "there'd've": "there would have", "they'd": "they would",
                     "they'd've": "they would have","they'll": "they will",
                     "they'll've": "they will have", "they're": "they are","they've": "they have",
                     "to've": "to have","wasn't": "was not","we'd": "we would",
                     "we'd've": "we would have","we'll": "we will","we'll've": "we will have",
                     "we're": "we are","we've": "we have", "weren't": "were not","what'll": "what will",
                     "what'll've": "what will have","what're": "what are", "what've": "what have",
                     "when've": "when have","where'd": "where did", "where've": "where have",
                     "who'll": "who will","who'll've": "who will have","who've": "who have",
                     "why've": "why have","will've": "will have","won't": "will not",
                     "won't've": "will not have", "would've": "would have","wouldn't": "would not",
                     "wouldn't've": "would not have","y'all": "you all", "y'all'd": "you all would",
                     "y'all'd've": "you all would have","y'all're": "you all are",
                     "y'all've": "you all have", "you'd": "you would","you'd've": "you would have",
                     "you'll": "you will","you'll've": "you will have", "you're": "you are",
                     "you've": "you have"}

In [None]:
def con_to_exp(x):
    """Expand English contractions in ``x`` using the ``contractions`` table.

    Backslashes are stripped first. The table entries are then applied
    longest key first, so a compound contraction such as "can't've" is
    expanded as a whole before its prefix "can't" (or a short suffix such as
    "'s") can rewrite part of it.

    Args:
        x: The text to expand. Anything that is not a ``str`` is passed
            through untouched.

    Returns:
        The expanded string, or ``x`` itself when it is not a string.
    """
    if not isinstance(x, str):
        return x
    x = x.replace('\\', '')
    # sorted() is stable, so keys of equal length keep their table order.
    for key in sorted(contractions, key=len, reverse=True):
        x = x.replace(key, contractions[key])
    return x

In [None]:
%%time
data1['feedback'] = data1['feedback'].apply(con_to_exp)

In [None]:
data1.head()

In [None]:
data1['feedback_lower']=data1['feedback'].str.lower()

In [None]:
import string
Remove_to_punctuation=string.punctuation

In [None]:
def remove_punctuation(text):
    """Return ``text`` with every character listed in ``Remove_to_punctuation`` deleted."""
    # Mapping a character to None in a translation table deletes it.
    deletion_table = str.maketrans(dict.fromkeys(Remove_to_punctuation))
    return text.translate(deletion_table)

data1['text_wo_punc']=data1['feedback_lower'].apply(lambda x: remove_punctuation(x))

In [None]:
data1.head()

In [None]:
data1=data1.drop('feedback_lower',axis=1)

In [None]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stpwrd=set(stopwords.words('english'))

def remove_stopwords(text):
    """Return ``text`` without the whitespace-separated tokens found in ``stpwrd``."""
    kept_tokens = (token for token in str(text).split() if token not in stpwrd)
    return " ".join(kept_tokens)

data1['text_wo_sw']=data1['text_wo_punc'].apply(lambda x: remove_stopwords(x))

In [None]:
data1.head()

In [None]:
data1=data1.drop('text_wo_punc',axis=1)
data1.head()

In [None]:
from collections import Counter

# Corpus-wide token frequencies over the stop-word-free reviews
count = Counter(
    token
    for review in data1['text_wo_sw'].values
    for token in review.split()
)

count.most_common(10)

In [None]:
# The ten most frequent tokens of the corpus are removed from every review.
Freqwords = {word for word, _ in count.most_common(10)}


def remove_frequentwords(text):
    """Return ``text`` without the whitespace-separated tokens found in ``Freqwords``."""
    return " ".join(word for word in str(text).split() if word not in Freqwords)


data1['text_wo_Freq_sw'] = data1['text_wo_sw'].apply(remove_frequentwords)
# regex=True is required: from pandas 2.0 on, Series.str.replace treats the
# pattern as a literal string by default, which would leave the text unchanged.
data1['text_wo_Freq_sw'] = data1['text_wo_Freq_sw'].str.replace(r'[^A-Za-z0-9 ]+', ' ', regex=True)
data1.head()

# Stemming

In [None]:
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
def stem_word(text):
    """Stem every whitespace-separated token of ``text`` with the module-level ``stemmer``."""
    stemmed_tokens = map(stemmer.stem, text.split())
    return " ".join(stemmed_tokens)


data1['stem_text']=data1['text_wo_Freq_sw'].apply(lambda x: stem_word(x))
data1.head()

# Lemmatization

In [None]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet

lemmatizer = WordNetLemmatizer()

def nltk_tag_to_wordnet_tag(nltk_tag):
    """Map an NLTK POS tag to the matching WordNet part-of-speech constant.

    Tags starting with ``J``, ``V``, ``N`` or ``R`` map to ``wordnet.ADJ``,
    ``wordnet.VERB``, ``wordnet.NOUN`` and ``wordnet.ADV`` respectively;
    any other tag yields ``None``.
    """
    prefix_to_attribute = (('J', 'ADJ'), ('V', 'VERB'), ('N', 'NOUN'), ('R', 'ADV'))
    for prefix, attribute in prefix_to_attribute:
        if nltk_tag.startswith(prefix):
            # Looked up only on a match, exactly like the branch-wise original.
            return getattr(wordnet, attribute)
    return None
def lemma_word(text):
    """Lemmatize ``text`` token by token and return the tokens joined by spaces.

    The text is tokenized and POS-tagged with NLTK; each tag is converted with
    ``nltk_tag_to_wordnet_tag`` and passed to the module-level ``lemmatizer``.
    """
    lemmas = []
    for token, pos_tag in nltk.pos_tag(nltk.word_tokenize(text)):
        wordnet_pos = nltk_tag_to_wordnet_tag(pos_tag)
        # A token without a WordNet part of speech is kept exactly as it is.
        lemmas.append(token if wordnet_pos is None else lemmatizer.lemmatize(token, wordnet_pos))
    return " ".join(lemmas)


data1['lemma_text']=data1['text_wo_Freq_sw'].apply(lambda x: lemma_word(x))

In [None]:
data1.head()

In [None]:
data1['user_sentiment']=data1['user_sentiment'].map({'Positive':1,'Negative':0})

In [None]:
data1.head()

In [None]:
#changing the name of the column
data1=data1.rename(columns={'lemma_text':'Final_text'})
data1.head()

In [None]:
# Drop the intermediate text columns; only Final_text is used from here on.
# ('text_wo_Freq_sw' was listed twice in the original drop list.)
data1 = data1.drop(['stem_text', 'text_wo_Freq_sw', 'text_wo_sw', 'feedback'], axis=1)
data1.head()

# Model Building

In [None]:
# Splitting the Data Set into Train and Test Sets
X = data1['Final_text']
y = data1['user_sentiment']

In [None]:
# Stratify on the label so that the sentiment classes keep the same proportions
# in the train and test partitions (the classes are imbalanced).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, stratify=y)

In [None]:
# Print train and test set shape
print ('Train Set Shape\t\t:{}\nTest Set Shape\t\t:{}'.format(X_train.shape, X_test.shape))

TF-IDF

In [None]:
# Create the word vector with TF-IDF Vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer
tfidf_vect = TfidfVectorizer(ngram_range=(1, 1))
# The vectorizer is fitted on the training texts only and reused for the test texts.
# The matrices are kept sparse: converting them to dense arrays allocates
# (documents x vocabulary) floats, and the estimators and SMOTE used below
# accept sparse input directly.
tfidf_vect_train = tfidf_vect.fit_transform(X_train)
tfidf_vect_test = tfidf_vect.transform(X_test)

Logistic Regression Model without oversampling

In [None]:
# Six independent, initially empty result lists (one per reported column).
model, resample, precision, recall, F1score, AUCROC = ([] for _ in range(6))

In [None]:
def test_eval(clf_model, X_test, y_test, algo=None, sampling=None):
    """Print test-set diagnostics for a fitted classifier and record its scores.

    Prints the confusion matrix, the classification report and the AUC-ROC,
    then appends one entry to each of the module-level lists ``model``,
    ``precision``, ``recall``, ``F1score``, ``AUCROC`` and ``resample``.

    Args:
        clf_model: Fitted classifier exposing ``predict`` and ``predict_proba``.
        X_test: Test features.
        y_test: True test labels.
        algo: Label stored in ``model`` for this run.
        sampling: Label stored in ``resample`` for this run.
    """
    class_probabilities = clf_model.predict_proba(X_test)
    predicted_labels = clf_model.predict(X_test)
    separator = '=' * 60

    print('Confusion Matrix')
    print(separator)
    print(confusion_matrix(y_test, predicted_labels), "\n")

    print('Classification Report')
    print(separator)
    print(classification_report(y_test, predicted_labels), "\n")

    print('AUC-ROC')
    print(separator)
    # The score is computed from the second probability column.
    auc_score = roc_auc_score(y_test, class_probabilities[:, 1])
    print(auc_score)

    model.append(algo)
    precision.append(precision_score(y_test, predicted_labels))
    recall.append(recall_score(y_test, predicted_labels))
    F1score.append(f1_score(y_test, predicted_labels))
    AUCROC.append(auc_score)
    resample.append(sampling)

In [None]:
# Logistic regression with default settings, trained on the TF-IDF training features
lr_model_tf_idf = LogisticRegression().fit(tfidf_vect_train, y_train)

# Predicted labels for the held-out reviews
test_pred_lr_all = lr_model_tf_idf.predict(tfidf_vect_test)

# F1 score on the test set
lr_f1 = f1_score(y_test, test_pred_lr_all)
print("F1 score: ", lr_f1)

In [None]:
test_eval(lr_model_tf_idf, tfidf_vect_test, y_test, 'Logistic Regression', 'actual')

Logistic Regression with SMOTE

In [None]:
from imblearn.over_sampling import SMOTE
from collections import Counter

counter = Counter(y_train)
print('Before',counter)
# Oversample the training features only (the test set is left untouched).
# A fixed seed makes the synthetic samples, and therefore every score computed
# from them, reproducible between runs.
smt = SMOTE(random_state=42)
X_train_sm, y_train_sm = smt.fit_resample(tfidf_vect_train, y_train)

counter = Counter(y_train_sm)
print('After',counter)

In [None]:
lr_model_tf_idf.fit(X_train_sm, y_train_sm)

In [None]:
test_eval(lr_model_tf_idf, tfidf_vect_test, y_test, 'Logistic Regression', 'smote')

Naive Bayes without Sampling

In [None]:
# Multinomial Naive Bayes trained on the original (not resampled) TF-IDF features
NB_tf_idf = MultinomialNB().fit(tfidf_vect_train, y_train)

test_pred_NB = NB_tf_idf.predict(tfidf_vect_test)

# F1 score on the test set
nb_f1 = f1_score(y_test, test_pred_NB)
print("F1 score: ", nb_f1)

In [None]:
test_eval(NB_tf_idf, tfidf_vect_test, y_test, 'Naive Bayes', 'actual')

In [None]:
NB_tf_idf.fit(X_train_sm,y_train_sm)

In [None]:
test_eval(NB_tf_idf, tfidf_vect_test, y_test, 'Naive Bayes', 'smote')

In [None]:
from sklearn.ensemble import RandomForestClassifier
### Setting up the model class
# Seeded so that the forest, and the scores reported for it, are reproducible;
# 42 matches the seed already used for the train/test split.
rf_model_tf_idf = RandomForestClassifier(random_state=42)

## Training the model
rf_model_tf_idf.fit(tfidf_vect_train,y_train)

## Predicting the results
test_pred_rf = rf_model_tf_idf.predict(tfidf_vect_test)

## Evaluating the model
print("F1 score: ",f1_score(y_test, test_pred_rf))

In [None]:
test_eval(rf_model_tf_idf, tfidf_vect_test, y_test, 'Random Forest', 'actual')

In [None]:
rf_model_tf_idf.fit(X_train_sm,y_train_sm)

In [None]:
test_eval(rf_model_tf_idf, tfidf_vect_test, y_test, 'Random Forest', 'smote')

In [None]:
# One row per test_eval() call, one column per recorded score
score_columns = {
    'model': model,
    'resample': resample,
    'precision': precision,
    'recall': recall,
    'f1-score': F1score,
    'AUC-ROC': AUCROC,
}
clf_eval_df = pd.DataFrame(score_columns)

In [None]:
clf_eval_df

In [None]:
data1.head()

Recommendation Engine

In [None]:
# we are creating second dataframe for recommendation Engine
data2=data1[['name','reviews_rating','reviews_username']]
data2.head()

In [None]:
data2['reviews_rating'].describe()

So the ratings range from 1 to 5.

In [None]:
data2.shape

In [None]:
data2 = data2.groupby('name').filter(lambda x: len(x)>=500)
data2.shape

In [None]:
data2.duplicated().sum()

In [None]:
## Let's remove these duplicate rows
# Rebind instead of dropping in place: data2 is derived from data1, and mutating
# a derived frame in place can trigger pandas' SettingWithCopyWarning.
data2 = data2.drop_duplicates()
data2.shape

In [None]:
data2.head()

# Train Test Split in Recommendation Engine

## User-User Based

In [None]:
train, test = train_test_split(data2, test_size = 0.30, random_state = 30)

In [None]:
print(train.shape)
print(test.shape)

In [None]:
df_pivot_features = train.pivot_table(
    index='reviews_username',
    columns='name',
    values='reviews_rating'
).fillna(0)

In [None]:
df_pivot_features.head()

In [None]:
# Copy the train dataset into dummy_train
dummy_train = train.copy()
# NOTE(review): dummy_test is reassigned from `common` in the evaluation cells
# further down before it is read.
dummy_test = test.copy()


In [None]:
# Build a mask: any rating >= 1 (a product the user has rated) becomes 0,
# everything else becomes 1.
dummy_train['reviews_rating'] = dummy_train['reviews_rating'].apply(lambda x: 0 if x>=1 else 1)

In [None]:
# Pivot to a user x product grid; user/product pairs without a rating are NaN
# after pivoting and are set to 1.
dummy_train = dummy_train.pivot_table(
    index='reviews_username',
    columns='name',
    values='reviews_rating'
).fillna(1)

In [None]:
dummy_train.head()

In [None]:
from sklearn.metrics.pairwise import pairwise_distances

# Creating the User Similarity Matrix using pairwise_distance function.
# Similarity = 1 - cosine distance, computed on the zero-filled rating pivot.
# NOTE(review): `user_correlation` is reassigned later in the notebook from
# mean-centred ratings, so this first version is only displayed.
user_correlation = 1 - pairwise_distances(df_pivot_features, metric='cosine')
# NaN similarities are replaced by 0.
user_correlation[np.isnan(user_correlation)] = 0
print(user_correlation)

In [None]:
user_correlation.shape

In [None]:
df_pivot_features = train.pivot_table(
    index='reviews_username',
    columns='name',
    values='reviews_rating'
)

In [None]:
df_pivot_features.head()

In [None]:
# Per-user mean rating (NaN cells ignored), subtracted from every rating of that user
mean = np.nanmean(df_pivot_features, axis=1)
df_subtracted = df_pivot_features.sub(mean, axis=0)


In [None]:
from sklearn.metrics.pairwise import pairwise_distances

# Creating the User Similarity Matrix using pairwise_distance function.
user_correlation = 1 - pairwise_distances(df_subtracted.fillna(0), metric='cosine')
user_correlation[np.isnan(user_correlation)] = 0
print(user_correlation)

In [None]:
# Prediction - User User
# Negative similarities are set to 0 before they are used as weights below.
user_correlation[user_correlation<0]=0
user_correlation

In [None]:
# Similarity-weighted sum of ratings: similarity matrix times the zero-filled
# user x product rating pivot.
user_predicted_ratings = np.dot(user_correlation, df_pivot_features.fillna(0))
user_predicted_ratings

In [None]:
user_predicted_ratings.shape

In [None]:
# Element-wise product with the dummy_train mask: cells where the mask is 0 are
# zeroed out, cells where it is 1 keep their predicted score.
user_final_rating = np.multiply(user_predicted_ratings,dummy_train)
user_final_rating.head()

TOP 5 Recommendation for the User

In [None]:
# Take the user ID as input.
user_input = '02deuce'
print(user_input)

In [None]:
# Five highest predicted scores for the selected user (user-user similarity)
ranked_for_user = user_final_rating.loc[user_input].sort_values(ascending=False)
d = ranked_for_user.head(5)
d

# MODEL EVALUATION

In [None]:
# Find out the common users of test and train dataset.
common = test[test.reviews_username.isin(train['reviews_username'])]
common.shape
common.head()

In [None]:
common_user_based_matrix = common.pivot_table(index='reviews_username', columns='name', values='reviews_rating')

In [None]:
user_correlation_df = pd.DataFrame(user_correlation)

df_subtracted.head(1)

user_correlation_df['reviews_username'] = df_subtracted.index

user_correlation_df.set_index('reviews_username',inplace=True)
user_correlation_df.head()

common.head(1)

In [None]:
list_name = common.reviews_username.tolist()

In [None]:
user_correlation_df.columns = df_subtracted.index.tolist()

In [None]:
user_correlation_df_1 =  user_correlation_df[user_correlation_df.index.isin(list_name)]

user_correlation_df_1.shape

In [None]:
user_correlation_df_2 = user_correlation_df_1.T[user_correlation_df_1.T.index.isin(list_name)]

user_correlation_df_3 = user_correlation_df_2.T

user_correlation_df_3.head()

user_correlation_df_3.shape

In [None]:
user_correlation_df_3[user_correlation_df_3<0]=0

In [None]:
common_user_predicted_ratings = np.dot(user_correlation_df_3, common_user_based_matrix.fillna(0))
common_user_predicted_ratings

In [None]:
dummy_test = common.copy()

In [None]:
dummy_test['reviews_rating'] = dummy_test['reviews_rating'].apply(lambda x: 1 if x>=1 else 0)

dummy_test = dummy_test.pivot_table(index='reviews_username', columns='name', values='reviews_rating').fillna(0)

dummy_test.shape

common_user_based_matrix.head()

dummy_test.head()

common_user_predicted_ratings = np.multiply(common_user_predicted_ratings,dummy_test)

common_user_predicted_ratings.head()

In [None]:
from sklearn.preprocessing import MinMaxScaler
# NOTE(review): this wildcard import rebinds builtins such as `sum` to NumPy's
# versions; later cells appear to rely on that, so it is kept.
from numpy import *

# Only strictly positive predictions are rescaled; every other cell becomes NaN.
X = common_user_predicted_ratings.where(common_user_predicted_ratings > 0)

# Rescale the remaining values to the 1-5 rating range.
scaler = MinMaxScaler(feature_range=(1, 5))
print(scaler.fit(X))
y = scaler.transform(X)

print(y)


In [None]:
common_ = common.pivot_table(index='reviews_username', columns='name', values='reviews_rating')

In [None]:
# Finding total non-NaN value
total_non_nan = np.count_nonzero(~np.isnan(y))


In [None]:
# Root-mean-square error between the actual ratings (common_) and the scaled
# predictions (y). NaN cells are skipped by np.nansum.
# The original `sum(sum(...))` only worked because `from numpy import *` had
# replaced the builtin `sum`; the explicit NumPy call removes that dependency.
squared_error = ((common_ - y) ** 2).to_numpy()
rmse = (np.nansum(squared_error) / total_non_nan) ** 0.5
print(rmse)

# Item Based Filtering

In [None]:
df_pivot_item = train.pivot_table(
    index='reviews_username',
    columns='name',
    values='reviews_rating'
).T

In [None]:
df_pivot_item.head()

In [None]:
# Per-product mean rating (NaN cells ignored), subtracted from every rating of that product
mean = np.nanmean(df_pivot_item, axis=1)
df_subtracted = df_pivot_item.sub(mean, axis=0)

df_subtracted.head()

In [None]:
from sklearn.metrics.pairwise import pairwise_distances

# Item Similarity Matrix
item_correlation = 1 - pairwise_distances(df_subtracted.fillna(0), metric='cosine')
item_correlation[np.isnan(item_correlation)] = 0
print(item_correlation)

item_correlation.shape

In [None]:
item_correlation[item_correlation<0]=0
item_correlation

In [None]:
item_predicted_ratings = np.dot((df_pivot_item.fillna(0).T),item_correlation)
item_predicted_ratings

item_predicted_ratings.shape

dummy_train.shape

In [None]:
item_final_rating = np.multiply(item_predicted_ratings,dummy_train)
item_final_rating.head()

# Top 5 Recommendation for the User

In [None]:
# Take the user ID as input
user_input = input('Enter Username')
print(user_input)

In [None]:
# Five highest item-based scores for the entered user
ranked_for_user = item_final_rating.loc[user_input].sort_values(ascending=False)
d = ranked_for_user.head(5)
d

# Evaluation

In [None]:
common =  test[test.name.isin(train.name)]
common.shape

In [None]:
common.head(4)

In [None]:
common_item_based_matrix = common.pivot_table(index='reviews_username', columns='name', values='reviews_rating').T

common_item_based_matrix.shape

In [None]:
item_correlation_df = pd.DataFrame(item_correlation)

item_correlation_df.head(1)

In [None]:
item_correlation_df['name'] = df_subtracted.index
item_correlation_df.set_index('name',inplace=True)
item_correlation_df.head()

In [None]:
list_name = common.name.tolist()

In [None]:
item_correlation_df.columns = df_subtracted.index.tolist()

item_correlation_df_1 =  item_correlation_df[item_correlation_df.index.isin(list_name)]

item_correlation_df_2 = item_correlation_df_1.T[item_correlation_df_1.T.index.isin(list_name)]

item_correlation_df_3 = item_correlation_df_2.T

item_correlation_df_3.head()

In [None]:
item_correlation_df_3[item_correlation_df_3<0]=0

In [None]:
common_item_predicted_ratings = np.dot(item_correlation_df_3, common_item_based_matrix.fillna(0))
common_item_predicted_ratings

In [None]:
common_item_predicted_ratings.shape

In [None]:
dummy_test = common.copy()

dummy_test['reviews_rating'] = dummy_test['reviews_rating'].apply(lambda x: 1 if x>=1 else 0)

dummy_test = dummy_test.pivot_table(index='reviews_username', columns='name', values='reviews_rating').T.fillna(0)

common_item_predicted_ratings = np.multiply(common_item_predicted_ratings,dummy_test)

In [None]:
common_ = common.pivot_table(index='reviews_username', columns='name', values='reviews_rating').T

In [None]:
from sklearn.preprocessing import MinMaxScaler
# NOTE(review): this wildcard import rebinds builtins such as `sum` to NumPy's
# versions; later cells appear to rely on that, so it is kept.
from numpy import *

# Only strictly positive predictions are rescaled; every other cell becomes NaN.
X = common_item_predicted_ratings.where(common_item_predicted_ratings > 0)

# Rescale the remaining values to the 1-5 rating range.
scaler = MinMaxScaler(feature_range=(1, 5))
print(scaler.fit(X))
y = scaler.transform(X)

print(y)


In [None]:
# Finding total non-NaN value
total_non_nan = np.count_nonzero(~np.isnan(y))

In [None]:
# Root-mean-square error between the actual ratings (common_) and the scaled
# predictions (y). NaN cells are skipped by np.nansum.
# The original `sum(sum(...))` only worked because `from numpy import *` had
# replaced the builtin `sum`; the explicit NumPy call removes that dependency.
squared_error = ((common_ - y) ** 2).to_numpy()
rmse = (np.nansum(squared_error) / total_non_nan) ** 0.5
print(rmse)

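## As we can see, item-based filtering gives the higher RMSE, and the item-based approach is the one carried forward for recommendations (note: a lower RMSE normally indicates the better recommender, so this choice should be re-checked against the two values printed above).
## Also, in model building, Random Forest with SMOTE gave the best accuracy, so Random Forest is used as the sentiment model.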

# EOF