In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, confusion_matrix, pairwise

In [2]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
import nltk
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import re
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Touch\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
X_train_raw = pd.read_csv("./data/X_train.csv")
X_submission_raw = pd.read_csv("./data/X_submission.csv")

In [5]:
# This is where you can do more feature extraction
def process_df(X_train_raw, X_submission_raw, columns_drop, submission_drop):
    
    X_train_raw = X_train_raw[X_train_raw['HelpfulnessNumerator'] <= X_train_raw['HelpfulnessDenominator']]
    X_train_raw = X_train_raw.dropna()
    print("Size of data:", X_train_raw.shape)
    
    X_train, X_test, Y_train, Y_test = train_test_split(
        X_train_raw.drop(['Score'], axis=1),
        X_train_raw['Score'],
        test_size=1/4.0,
        random_state=0
    )
    
    X_train_processed = X_train.drop(columns=columns_drop)
    X_test_processed = X_test.drop(columns=columns_drop)
    X_submission_processed = X_submission_raw.drop(columns=submission_drop)
    
    return X_train_processed, X_test_processed, X_submission_processed, Y_train, Y_test

X_train_processed, X_test_processed, X_submission_processed, Y_train, Y_test = process_df(X_train_raw,X_submission_raw, ['Id', 'ProductId', 'UserId', 'Summary', 'Time'], ['Id', 'ProductId', 'UserId', 'Summary', 'Score'])


Size of data: (1397455, 9)


In [6]:
def remove_special_char(text):
    pattern=r'[^a-zA-z0-9\s]'
    text=re.sub(pattern,'',text)
    return text

In [7]:
def preprocess_text_nosent(df, column):
#     texts = df[column]
#     processed_text = []
#     count = 0
#     for t in texts:
#         if(count != 0 and count % 100000 == 0):
#             print(count)
#         processed_text.append(remove_special_char(str(t)).strip().lower())
#         count+=1
    processed_texts = df[column].apply(lambda row: remove_special_char(str(row)).strip().lower())
    
    return processed_texts

In [9]:
def count_vectorizer(df_train_processed, df_test_processed, n_gram=(1,1)):
    vectorizer = CountVectorizer(stop_words="english", ngram_range=n_gram)
    X_train = vectorizer.fit_transform(df_train_processed)
    X_test = vectorizer.transform(df_test_processed)
    X_submission = vectorizer.transform(df_submission_processed)
    
    return X_train, X_test, X_submission

In [8]:
def tfidf_vectorizer(df_train_processed, df_test_processed, df_submission_processed):
    vectorizer = TfidfVectorizer(stop_words="english")
    X_train = vectorizer.fit_transform(df_train_processed)
    X_test = vectorizer.transform(df_test_processed)
    X_submission = vectorizer.transform(df_submission_processed)
    
    return X_train, X_test, X_submission

In [10]:
df_train_processed_nosent = preprocess_text_nosent(X_train_processed, 'Text')
df_test_processed_nosent = preprocess_text_nosent(X_test_processed, 'Text')
df_submission_processed_nosent = preprocess_text_nosent(X_submission_processed, 'Text')

In [None]:
# X_train_cv, X_test_cv, X_submission_cv = count_vectorizer(df_train_processed_nosent, df_test_processed_nosent, df_submission_processed_nosent)
vectorizer = CountVectorizer(stop_words="english", ngram_rangetuple=(1,2))
X_train_cv = vectorizer.fit_transform(df_train_processed_nosent)
X_test_cv = vectorizer.transform(df_test_processed_nosent)
X_submission_cv = vectorizer.transform(df_submission_processed_nosent)

In [12]:
#X_train_tfidf, X_test_tfidf = tfidf_vectorizer(df_train_processed_nosent, df_test_processed_nosent)
vectorizer_tfidf = TfidfVectorizer(stop_words="english")
X_train_tfidf = vectorizer_tfidf.fit_transform(df_train_processed_nosent)
X_test_tfidf = vectorizer_tfidf.transform(df_test_processed_nosent)
X_submission_tfidf = vectorizer_tfidf.transform(df_submission_processed_nosent)

In [13]:
mnb = MultinomialNB().fit(X_train_cv, Y_train)

In [14]:
mnb_tfidf = MultinomialNB().fit(X_train_tfidf, Y_train)

In [15]:
Y_test_predictions_mnb = mnb.predict(X_test_cv)
print("RMSE on testing set = ", mean_squared_error(Y_test, Y_test_predictions_mnb))

RMSE on testing set =  1.3147233258149094


In [16]:
Y_test_predictions_mnb_tfidf = mnb_tfidf.predict(X_test_tfidf)
print("RMSE on testing set = ", mean_squared_error(Y_test, Y_test_predictions_mnb_tfidf))

RMSE on testing set =  2.210339359521874


In [None]:
X_submission_raw['Score'] = mnb.predict(X_submission_cv)
# X_submission_raw['Score'] = mnb_tfidf.predict(X_submission_tfidf)

In [None]:
submission = X_submission_raw[['Id', 'Score']]
submission.to_csv("./data/submission.csv", index=False)