# CODE

In [84]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import re
import nltk
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import mean_squared_error, confusion_matrix
from sklearn import svm
from sklearn.linear_model import LogisticRegression

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rohan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rohan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


LOADING THE DATASET

In [85]:
train_data = pd.read_csv(r"C:\Users\rohan\Downloads\DS_midterm\train.csv")
test_data = pd.read_csv(r"C:\Users\rohan\Downloads\DS_midterm\test.csv")

print("train.csv shape is ", train_data.shape)
print("test.csv shape is ", test_data.shape)

train.csv shape is  (139753, 9)
test.csv shape is  (13976, 2)


In [86]:
test_data= pd.merge(train_data, test_data, left_on='Id', right_on='Id')
test_data = test_data.drop(columns=['Score_x'])
test_data = test_data.rename(columns={'Score_y': 'Score'})

In [None]:
test_data.head()

PREPROCESSING

In [87]:
def process(trainingSet,submissionSet,col = 'Text'):
    trainingSet['Helpfulness'] = trainingSet['HelpfulnessNumerator'] / trainingSet['HelpfulnessDenominator']
    trainingSet['Helpfulness'] = trainingSet['Helpfulness'].fillna(0)
    
    training_helpful = trainingSet[(trainingSet['HelpfulnessNumerator']<=trainingSet['HelpfulnessDenominator'])]
    training_drop = training_helpful.dropna()
    print("train set after cleaning wrong in helpfulness:   " , trainingSet.shape)
    print("train set after drop NaN:   ",training_drop.shape)

    X_train, X_test, Y_train, Y_test = train_test_split(
        training_drop.drop(['Score'], axis=1),
        training_drop['Score'],
        test_size=1/4.0,
        random_state=0
    )


    if col == 'Text':
        drop_col = ['Id', 'ProductId', 'UserId', 'Summary', 'Time']
    elif col == 'Summary':
        drop_col = ['Id', 'ProductId', 'UserId', 'Text', 'Time']

    X_train_processed = X_train.drop(columns = drop_col)
    X_test_processed = X_test.drop(columns = drop_col)
    submission_processed = submissionSet.drop(columns = drop_col)
    print("train set shape:  ",X_train_processed.shape,"test set shape:  ",X_test_processed.shape)
    
    return X_train_processed,X_test_processed,Y_train,Y_test,submission_processed

CLEANING THE DATASET

In [88]:
#clean the text

def remove_char(x):
    special = '[^A-Za-z ]+'
    x = re.sub(special,'',x)
    x = x.strip()
    x = x.lower()
    return x


def clean_word(dataset,col):
    stop_words = set(stopwords.words('english'))
    test = dataset[col].apply(lambda row: remove_char(str(row))).apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
    return test

In [89]:
train_data['Text'] = train_data['Text'] + ' ' + train_data['Summary']
test_data['Text'] = test_data['Text']  + ' ' + test_data['Summary']

In [90]:
#SEPERATE THE DATASET 
X_train_processed,X_test_processed,Y_train,Y_test,submission_processed = process(train_data,test_data,'Text')

train set after cleaning wrong in helpfulness:    (139753, 10)
train set after drop NaN:    (125774, 10)
train set shape:   (94330, 4) test set shape:   (31444, 4)


In [91]:
X_train_processed.head()

Unnamed: 0,HelpfulnessNumerator,HelpfulnessDenominator,Text,Helpfulness
85931,0,0,christian movies of what is to become in our w...,0.0
100573,2,2,Leave it to director John Frankenheimer to mak...,1.0
17624,1,2,My grands just loved the 3D version. The 3D ve...,0.5
30520,0,0,"But it SUCKED! OK, it started out really cool...",0.0
8247,146,151,This delightful trilogy tells the story of Eli...,0.966887


In [92]:
# APLLY THE CLEANNING FUNCTION TO DATAFRAME
X_train_text = clean_word(X_train_processed,'Text')
X_test_text = clean_word(X_test_processed,'Text')
submission_text = clean_word(submission_processed,'Text')
X_train_text.head()

85931     christian movies become world today would save...
100573    leave director john frankenheimer make movie t...
17624     grands loved version versions movies order cle...
30520     sucked ok started really cool group like frien...
8247      delightful trilogy tells story elisabeth bavar...
Name: Text, dtype: object

USING TF-IDF TO VECTORIZE THE TEXT COLUMN

In [93]:
#countVector the data
vectorizer_text = CountVectorizer()
X_training_vector_text = vectorizer_text.fit_transform(X_train_text)
X_test_vectorr_text = vectorizer_text.transform(X_test_text)
submission_vectorr_text = vectorizer_text.transform(submission_text)
print("shape of training set CountVector&text: ",X_training_vector_text.shape)

shape of training set CountVector&text:  (94330, 317101)


In [94]:
#tfidf the data
tfidf_text = TfidfVectorizer()
X_training_tfidf_text = tfidf_text.fit_transform(X_train_text)
X_test_tfidf_text = tfidf_text.transform(X_test_text)
submission_tfidf_text = tfidf_text.transform(submission_text)
print("shape of training set TFIDF&text: ",X_training_tfidf_text.shape)


shape of training set TFIDF&text:  (94330, 317101)


In [None]:

clf7 = LogisticRegression(random_state=0,C=80).fit(X_training_tfidf_text,Y_train)
logprevec7 = clf7.predict(X_test_tfidf_text)
print("RMSE on testing set = ", mean_squared_error(Y_test, logprevec7))

SELCTING A MODEL AND TUNING ITS PARAMETERS

In [100]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
model = linear_model.Ridge(alpha=3.0).fit(X_training_tfidf_text,Y_train)
logprevec9= model.predict(X_test_tfidf_text)
print("RMSE on testing set = ", mean_squared_error(Y_test, logprevec9))
#print accuracy
#print("Accuracy on testing set = ", accuracy_score(Y_test, model.predict(X_test_tfidf_text)))


RMSE on testing set =  0.6991291067764173
R^2 score on testing set =  0.50157853724913


TESTING THE MODEL ON THE TEST SET AND SAVING TO A CSV FILE

In [97]:
submission_predict = test_data
submission_predict['Score'] = model.predict(submission_tfidf_text)

submission_output = submission_predict[['Id','Score']]
submission_output.to_csv(r"C:\Users\rohan\Downloads\DS_midterm\submission.csv",index = False)

In [None]:

submission_predict = test_data