In [3]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn import model_selection
import nltk
import math
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error
from math import sqrt


In [4]:
raw = pd.read_csv('./train.csv')
raw

Unnamed: 0,HelpfulnessDenominator,HelpfulnessNumerator,Id,ProductId,Score,Summary,Text,Time,UserId
0,0,0,130058,B000CQIDHY,5.0,A worthy and welcome replacement,I don't know what has happened to formulation ...,1337817600,A3VZR9TPF2GERB
1,0,0,91622,B004YV80OE,4.0,"It was okay, good flavor",Kraft's a safe brand. They will produce food f...,1317254400,A1B1QMGK8VYG80
2,10,6,699,B000G6MBX2,1.0,"The ""Organic"" Label is Misleading","""Yeast Extract"" is listed as an ingredient. So...",1195084800,A1AQ2W2R4SOVGN
3,0,0,265935,B0001GDC4O,5.0,Fresh/Stale,Some of these espresso pods were fresh and som...,1272499200,A2IVH1D3GLACL3
4,1,1,199932,B000EDG430,5.0,Baked to perfection in my bread machine!,"I am not one to write reviews, but this bread ...",1336953600,AEOINN8F4D9DQ
5,2,2,37969,B000H26J7E,2.0,Okay,I'm not sure I'm reviewing exactly the same Li...,1299801600,AUTBHG6070SL4
6,0,0,104755,B000UK3G2Y,5.0,The perfect cup of tea,I first had this company's tea in Seattle. It...,1215302400,A1T1LLS0I1FFPR
7,4,1,362227,B001E6KBJ4,2.0,"Warning, Contains High Fructose Corn Syrup",This was my favorite cereal for years...until ...,1249516800,A34ZQGFACNZRCL
8,2,1,410735,B000EDROXO,1.0,Stale and bad customer service,"The product was stale, 8 months old. My packag...",1341532800,A2OAC8Y7XGFTX7
9,0,0,333863,B000AXSG2Q,4.0,Mostly great.,I really enjoy these pumps for their looks and...,1312761600,A26732RC19B9YE


In [5]:
train_data = raw.dropna(how='any', subset=['Score'])
train_data

Unnamed: 0,HelpfulnessDenominator,HelpfulnessNumerator,Id,ProductId,Score,Summary,Text,Time,UserId
0,0,0,130058,B000CQIDHY,5.0,A worthy and welcome replacement,I don't know what has happened to formulation ...,1337817600,A3VZR9TPF2GERB
1,0,0,91622,B004YV80OE,4.0,"It was okay, good flavor",Kraft's a safe brand. They will produce food f...,1317254400,A1B1QMGK8VYG80
2,10,6,699,B000G6MBX2,1.0,"The ""Organic"" Label is Misleading","""Yeast Extract"" is listed as an ingredient. So...",1195084800,A1AQ2W2R4SOVGN
3,0,0,265935,B0001GDC4O,5.0,Fresh/Stale,Some of these espresso pods were fresh and som...,1272499200,A2IVH1D3GLACL3
4,1,1,199932,B000EDG430,5.0,Baked to perfection in my bread machine!,"I am not one to write reviews, but this bread ...",1336953600,AEOINN8F4D9DQ
5,2,2,37969,B000H26J7E,2.0,Okay,I'm not sure I'm reviewing exactly the same Li...,1299801600,AUTBHG6070SL4
6,0,0,104755,B000UK3G2Y,5.0,The perfect cup of tea,I first had this company's tea in Seattle. It...,1215302400,A1T1LLS0I1FFPR
7,4,1,362227,B001E6KBJ4,2.0,"Warning, Contains High Fructose Corn Syrup",This was my favorite cereal for years...until ...,1249516800,A34ZQGFACNZRCL
8,2,1,410735,B000EDROXO,1.0,Stale and bad customer service,"The product was stale, 8 months old. My packag...",1341532800,A2OAC8Y7XGFTX7
9,0,0,333863,B000AXSG2Q,4.0,Mostly great.,I really enjoy these pumps for their looks and...,1312761600,A26732RC19B9YE


In [6]:
train_i, test_i = train_test_split(np.arange(len(train_data)), train_size=0.8, random_state=1)

train = train_data.loc[train_i]
test = train_data.loc[test_i]

In [7]:
def getwordlist(text):
    words = text.lower().split()
    return words

In [8]:
#clean train
clean_train = []
for review in train['Text']:
    clean_train.append(" ".join(getwordlist(review)))
    
#clean test
clean_test = []
for review in test['Text']:
    clean_test.append(" ".join(getwordlist(review)))


In [9]:
vectorizer =  TfidfVectorizer( max_features = 20000, ngram_range = ( 1, 3 ), 
	sublinear_tf = True )

In [10]:
train_data_features = vectorizer.fit_transform(clean_train)
test_data_features = vectorizer.transform(clean_test)

In [11]:
def train_and_eval_rmse( model, train_x, train_y, test_x, test_y ):
    model.fit( train_x, train_y )
    p = model.predict( test_x )
    mse = mean_squared_error( test_y, p )
    return sqrt(mse)

In [12]:
lr = LogisticRegression(solver='newton-cg', multi_class='multinomial')
rmse = train_and_eval_rmse(lr, train_data_features, train['Score'], test_data_features, test['Score'].values)
print("RMSE is: ", rmse)

RMSE is:  0.8342787864503811


In [37]:
real_test_raw = pd.read_csv('./test.csv')
real_test_raw.head()

Unnamed: 0,Id,Score
0,413937,
1,16525,
2,221883,
3,82207,
4,8354,


In [14]:
real_test = raw.merge(real_test_raw, on='Id', how='inner')

#clean test
clean_real_test = []
for review in real_test['Text']:
    clean_real_test.append(" ".join(getwordlist(review)))

real_test_data_features = vectorizer.transform(clean_real_test)
real_test_data_features

<100000x20000 sparse matrix of type '<class 'numpy.float64'>'
	with 9300831 stored elements in Compressed Sparse Row format>

In [16]:
real_prediction = lr.predict(real_test_data_features)


In [38]:
submission = real_test_raw.drop(labels=['Score'], axis=1)
submission['Score'] = real_prediction
submission

Unnamed: 0,Id,Score
0,413937,1.0
1,16525,3.0
2,221883,5.0
3,82207,1.0
4,8354,5.0
5,483228,5.0
6,350796,5.0
7,121313,5.0
8,494256,1.0
9,260366,5.0


In [39]:
submission.to_csv('submission.csv', index=False)