In [55]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, confusion_matrix, pairwise

In [26]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
import nltk
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import re
# Fetch the English stopword corpus read by preprocess_text_analyzer.
# NOTE(review): word_tokenize presumably also needs NLTK's punkt tokenizer
# data, which this cell does not download — confirm it is installed.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Touch\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Read the training and submission CSV files from the local ./data folder.
train_csv_path = "./data/X_train.csv"
submission_csv_path = "./data/X_submission.csv"
X_train_raw = pd.read_csv(train_csv_path)
X_submission_raw = pd.read_csv(submission_csv_path)

In [12]:
# Dimensions of the raw training frame as (rows, columns).
X_train_raw.shape

(1397533, 9)

In [32]:
# Preview the first rows of the raw training frame.
X_train_raw.head()

Unnamed: 0,Id,ProductId,UserId,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,0,5019281,ADZPIG9QOCDG5,0,0,4.0,1203984000,good version of a classic,This is a charming version of the classic Dick...
1,1,5019281,A35947ZP82G7JH,0,0,3.0,1388361600,Good but not as moving,It was good but not as emotionally moving as t...
2,2,5019281,A3UORV8A9D5L2E,0,0,3.0,1388361600,Winkler's Performance was ok at best!,"Don't get me wrong, Winkler is a wonderful cha..."
3,3,5019281,A1VKW06X1O2X7V,0,0,5.0,1202860800,It's an enjoyable twist on the classic story,Henry Winkler is very good in this twist on th...
4,4,5019281,A3R27T4HADWFFJ,0,0,4.0,1387670400,Best Scrooge yet,This is one of the best Scrooge movies out. H...


In [43]:
# NOTE(review): identical to the preceding preview cell and produces the same
# output; one of the two can be removed.
X_train_raw.head()

Unnamed: 0,Id,ProductId,UserId,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,0,5019281,ADZPIG9QOCDG5,0,0,4.0,1203984000,good version of a classic,This is a charming version of the classic Dick...
1,1,5019281,A35947ZP82G7JH,0,0,3.0,1388361600,Good but not as moving,It was good but not as emotionally moving as t...
2,2,5019281,A3UORV8A9D5L2E,0,0,3.0,1388361600,Winkler's Performance was ok at best!,"Don't get me wrong, Winkler is a wonderful cha..."
3,3,5019281,A1VKW06X1O2X7V,0,0,5.0,1202860800,It's an enjoyable twist on the classic story,Henry Winkler is very good in this twist on th...
4,4,5019281,A3R27T4HADWFFJ,0,0,4.0,1387670400,Best Scrooge yet,This is one of the best Scrooge movies out. H...


<h2> NB Model for Rating-to-Feature </h2>

In [5]:
# Lazily-filled cache of the English stopword list. It is populated on first
# use (not at definition time) so this cell can be run before the corpus has
# been downloaded.
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the NLTK English stopwords as a frozenset, loading them once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def preprocess_text_analyzer(text):
    """Turn one raw document into a list of cleaned, stopword-free tokens.

    The input is coerced to ``str``, stripped of special characters via
    ``remove_special_char``, trimmed, lower-cased and tokenized with
    ``word_tokenize``; tokens found in the English stopword list are dropped.

    Parameters
    ----------
    text : object
        The document; any value is accepted because it is passed through
        ``str()`` first.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    text = remove_special_char(str(text)).strip().lower()
    # Look the stopwords up once per call. Previously `stopwords.words(...)`
    # was re-evaluated for every token and searched as a list.
    stop_words = _english_stopwords()
    return [w for w in word_tokenize(text) if w not in stop_words]

In [27]:
def remove_special_char(text):
    """Remove every character that is not an ASCII letter, digit or whitespace.

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        ``text`` with all other characters deleted (nothing is substituted in
        their place).
    """
    # The class previously used the range `A-z`, which also covers the
    # characters [ \ ] ^ _ and the backtick, so those were never removed.
    pattern = r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)

In [50]:
def preprocess_for_text_helpfulness(X_train_raw, X_submission_raw, columns_drop, submission_drop,
                                    test_size=1/4.0, random_state=0):
    """Clean the training reviews, encode users as integers and split them.

    Steps, in order:
      1. Keep only rows where HelpfulnessNumerator <= HelpfulnessDenominator.
      2. Drop every row that contains a missing value.
      3. Add a ``unique_ID_int`` column holding a 1-based integer for each
         distinct ``UserId``, numbered in sorted ``UserId`` order.
      4. Split the features and the ``Score`` column with ``train_test_split``.
      5. Drop ``columns_drop`` from both feature splits and ``submission_drop``
         from the submission frame (which is otherwise left untouched).

    Parameters
    ----------
    X_train_raw : pandas.DataFrame
        Training frame; must contain ``HelpfulnessNumerator``,
        ``HelpfulnessDenominator``, ``UserId`` and ``Score``.
    X_submission_raw : pandas.DataFrame
        Submission frame; only has ``submission_drop`` removed.
    columns_drop : list of str
        Columns removed from the train and test feature frames.
    submission_drop : list of str
        Columns removed from the submission frame.
    test_size : float, optional
        Forwarded to ``train_test_split``; defaults to the previously
        hard-coded ``1/4.0``.
    random_state : int, optional
        Forwarded to ``train_test_split``; defaults to the previously
        hard-coded ``0``.

    Returns
    -------
    tuple
        ``(X_train_processed, X_test_processed, X_submission_processed,
        Y_train, Y_test)``.
    """
    X_train_raw = X_train_raw[X_train_raw['HelpfulnessNumerator'] <= X_train_raw['HelpfulnessDenominator']]
    X_train_raw = X_train_raw.dropna()

    # Lookup table: one row per distinct user, numbered from 1.
    temp_df = pd.DataFrame(np.unique(X_train_raw['UserId']), columns=['unique_ID'])
    temp_df['unique_ID_int'] = range(1, len(temp_df['unique_ID'].index)+1)
    X_train_raw = pd.merge(X_train_raw, temp_df, left_on='UserId', right_on='unique_ID')
    # The merge key copy is redundant once the integer code is attached.
    X_train_raw = X_train_raw.drop(columns=['unique_ID'])

    X_train, X_test, Y_train, Y_test = train_test_split(
        X_train_raw.drop(['Score'], axis=1),
        X_train_raw['Score'],
        test_size=test_size,
        random_state=random_state
    )

    X_train_processed = X_train.drop(columns=columns_drop)
    X_test_processed = X_test.drop(columns=columns_drop)
    X_submission_processed = X_submission_raw.drop(columns=submission_drop)

    return X_train_processed, X_test_processed, X_submission_processed, Y_train, Y_test

In [51]:
# Columns removed from the train/test features and from the submission frame.
train_drop_cols = ['Id', 'ProductId', 'UserId', 'Summary', 'Time']
submission_drop_cols = ['Id', 'ProductId', 'UserId', 'Text', 'Summary', 'Score']

(
    X_train_NB,
    X_test_NB,
    X_submission_NB,
    Y_train_NB,
    Y_test_NB,
) = preprocess_for_text_helpfulness(
    X_train_raw,
    X_submission_raw,
    train_drop_cols,
    submission_drop_cols,
)

In [52]:
# Preview the processed training features returned by preprocess_for_text_helpfulness.
X_train_NB.head()

Unnamed: 0,HelpfulnessNumerator,HelpfulnessDenominator,Text,unique_ID_int
214081,0,2,This is an excelent movie! Gregory peck is ins...,122851
544289,3,14,"If I could , I would give this movie no stars....",1313
500506,3,3,first time i saw this i thought it was a borin...,90809
493690,0,0,I had no idea that this was a Disney movie unt...,103081
579854,1,1,Watching children.in my home and they love ani...,106945


In [28]:
# TF-IDF features built with the custom analyzer, fed into multinomial Naive Bayes.
tfidf_step = TfidfVectorizer(ngram_range=(1,1), analyzer=preprocess_text_analyzer)
nb_step = MultinomialNB()
pipeline = Pipeline([('Tf-Idf', tfidf_step), ('classifier', nb_step)])

In [None]:
# Fit the TF-IDF + Naive Bayes pipeline on the training review text, using
# the Score values from the split as labels.
pipeline.fit(X_train_NB['Text'], Y_train_NB)

In [None]:
# Predict on the held-out split. The notebook-level name for that frame is
# X_test_NB; X_test_processed exists only inside preprocess_for_text_helpfulness
# and raised a NameError here.
pip_pred = pipeline.predict(X_test_NB['Text'])

In [None]:
# Pair each held-out review text with its predicted value and save to CSV.
# Uses X_test_NB, the notebook-level test frame; X_test_processed is local to
# preprocess_for_text_helpfulness and is undefined at this scope.
rev_test_pred_NB_df = pd.DataFrame(data={'Text test': X_test_NB['Text'], 'prediction': pip_pred})
rev_test_pred_NB_df.to_csv('pred_NB_df.csv')

<h2> kNN Model for Feature-to-User </h2>

In [53]:
# Keep every training column except the free-text review for the similarity step.
X_train_similarity = X_train_NB.drop(['Text'], axis=1)

In [None]:
# Pairwise cosine similarity of every training row against every other row.
# NOTE(review): the result is a dense (n_rows x n_rows) array; the raw frame
# has 1,397,533 rows, so this is likely far too large to fit in memory —
# confirm, or compute on a sample / in chunks.
# NOTE(review): unique_ID_int is an arbitrary integer label per user and is
# part of these vectors — confirm that including it in the similarity is intended.
pairwise.cosine_similarity(X_train_similarity, X_train_similarity)