# Load data

In [1]:
import pandas as pd
import numpy as np

# Read the two CSV inputs: the training reviews and the test reviews.
df = pd.read_csv('training_data.csv')
test = pd.read_csv('test_data.csv')

# Preview the first five rows of each frame (test first, then training).
print(test.head(5), df.head(5), sep='\n')

# Keep the test review ids as a NumPy array; later cells pair them with
# the predicted ratings when writing the result files.
outid = test['review_id'].values

# `train` is a second name for the same DataFrame object as `df` (no copy).
train = df

   review_id  business_id  user_id  \
0       2713          668     2196   
1       4734         1014     3521   
2       5598         2939     3028   
3       9545         4077     6200   
4       1471         1126     1268   

                                                text        date  
0  Ticoz is not a bad place.  The menu is eclecti...  2009-01-04  
1  Wow!\r\nBeing a Phoenix native, we have enjoye...  2010-10-05  
2  Jodi is fantastic!  Extraordinary!  After too ...  2011-04-29  
3  T&S Glass did a wonderful job replacing my car...  2007-10-02  
4  My experience was on a random Saturday afterno...  2010-08-16  
   review_id  business_id  user_id  \
0       3223         2055     2533   
1       9938         4165     6371   
2       7123          869     4929   
3       3601         1603     2789   
4       3948         2347     1245   

                                                text        date  stars  
0  Sometimes things happen, and when they do this...  2010-12-30  

# Rating-star distribution in the training data

In [2]:
# Order the training frame by rating, then count how many reviews carry
# each rating and show (at most) the ten most frequent values.
z = df.sort_values(by='stars')
print(z['stars'].value_counts().iloc[:10])

4    2820
5    2669
3    1168
2     741
1     599
Name: stars, dtype: int64


# Use Logistic Regression
## Use count vectorizer

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

# Learn the bag-of-words vocabulary from the training review text only.
vectorizer = CountVectorizer()
vectorizer.fit(train['text'])

# Star ratings are the classification target. `.values` replaces
# `.as_matrix()`, which pandas deprecated and later removed.
y_train = train['stars'].values

# Encode both splits with the vocabulary fitted above.
X_train = vectorizer.transform(train['text'])
X_test = vectorizer.transform(test['text'])
print((len(vectorizer.vocabulary_)))

clf = LogisticRegression()
clf.fit(X_train, y_train)
pred_y = clf.predict(X_test)

# Accuracy on the same data the model was fitted on (not a held-out estimate).
print(clf.score(X_train, y_train))

# One row per test review: (review id, predicted stars). `outid` holds
# test['review_id'], so the column is labelled accordingly. np.savetxt writes
# no header row, so the label does not change the CSV contents.
output = np.vstack((outid, pred_y)).T
output = pd.DataFrame(output, columns=['review_id', 'stars'])
np.savetxt("CV_LR.csv", output, fmt='%s', delimiter=",")

  # Remove the CWD from sys.path while we load stuff.


26351
0.9757409028385645


## Use tf-idf

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features over unigrams, bigrams and trigrams, fitted on the
# training review text only.
vectorizer = TfidfVectorizer(ngram_range=(1, 3))
vectorizer.fit(train['text'])

# Encode both splits with the fitted vectorizer.
X_train = vectorizer.transform(train['text'])
X_test = vectorizer.transform(test['text'])

# `.values` replaces `.as_matrix()`, which pandas deprecated and later removed.
y_train = train['stars'].values

clf = LogisticRegression()
clf.fit(X_train, y_train)

pred_y = clf.predict(X_test)

# Accuracy on the same data the model was fitted on (not a held-out estimate).
print(clf.score(X_train, y_train))

# One row per test review: (review id, predicted stars). `outid` holds
# test['review_id'], so the column is labelled accordingly. np.savetxt writes
# no header row, so the label does not change the CSV contents.
output = np.vstack((outid, pred_y)).T
output = pd.DataFrame(output, columns=['review_id', 'stars'])
np.savetxt("Tfid_LR.csv", output, fmt='%s', delimiter=",")

  # Remove the CWD from sys.path while we load stuff.


0.8035513317494061


# Use SGD classifier + tf-idf

In [5]:
from sklearn.linear_model import SGDClassifier

# TF-IDF features (default settings), fitted on the training review text only.
vectorizer = TfidfVectorizer()
vectorizer.fit(train['text'])

# Encode both splits with the fitted vectorizer.
X_train = vectorizer.transform(train['text'])
X_test = vectorizer.transform(test['text'])

# `.values` replaces `.as_matrix()`, which pandas deprecated and later removed.
y_train = train['stars'].values

# Linear classifier trained with SGD on hinge loss; tol=None makes it run
# exactly max_iter passes, and random_state fixes the shuffling.
clf = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42, max_iter=5, tol=None)
clf = clf.fit(X_train, y_train)
pred_y = clf.predict(X_test)

# Accuracy on the same data the model was fitted on (not a held-out estimate).
print(clf.score(X_train, y_train))

# One row per test review: (review id, predicted stars). `outid` holds
# test['review_id'], so the column is labelled accordingly. np.savetxt writes
# no header row, so the label does not change the CSV contents.
output = np.vstack((outid, pred_y)).T
output = pd.DataFrame(output, columns=['review_id', 'stars'])
np.savetxt("SGDC.csv", output, fmt='%s', delimiter=",")

0.7097661623108665


  # Remove the CWD from sys.path while we load stuff.


# Use a Pipeline (CountVectorizer, TfidfTransformer, SGDClassifier)

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

# Training frame reduced to the `text` column. NOTE(review): the pipeline
# below is fed train['text'] directly, so this frame is not consumed in this
# cell; it is kept only so the `X_train` name is bound exactly as before.
X_train = df.drop(['review_id', 'business_id', 'user_id', 'date', 'stars'], axis=1)

# `.values` replaces `.as_matrix()`, which pandas deprecated and later removed.
y_train = train['stars'].values

# Chain raw text -> token counts -> TF-IDF weights -> linear SGD classifier,
# so fitting and predicting both start from the raw review strings.
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                           alpha=1e-3, random_state=42,
                                           max_iter=5, tol=None)),
                     ])
text_clf.fit(train['text'], y_train)
pred_y = text_clf.predict(test['text'])

# One row per test review: (review id, predicted stars). `outid` holds
# test['review_id'], so the column is labelled accordingly. np.savetxt writes
# no header row, so the label does not change the CSV contents.
output = np.vstack((outid, pred_y)).T
output = pd.DataFrame(output, columns=['review_id', 'stars'])
np.savetxt("Pipe.csv", output, fmt='%s', delimiter=",")


  del sys.path[0]


# Use LinearSVC

In [7]:
from sklearn.svm import LinearSVC

# TF-IDF features (default settings), fitted on the training review text only.
vectorizer = TfidfVectorizer()
vectorizer.fit(train['text'])

# Encode both splits with the fitted vectorizer.
X_train = vectorizer.transform(train['text'])
X_test = vectorizer.transform(test['text'])

# `.values` replaces `.as_matrix()`, which pandas deprecated and later removed.
y_train = train['stars'].values

clf = LinearSVC()
clf = clf.fit(X_train, y_train)
pred_y = clf.predict(X_test)

# Accuracy on the same data the model was fitted on (not a held-out estimate).
print(clf.score(X_train, y_train))

# One row per test review: (review id, predicted stars). `outid` holds
# test['review_id'], so the column is labelled accordingly. np.savetxt writes
# no header row, so the label does not change the CSV contents.
output = np.vstack((outid, pred_y)).T
output = pd.DataFrame(output, columns=['review_id', 'stars'])
np.savetxt("L_SVC.csv", output, fmt='%s', delimiter=",")

  # Remove the CWD from sys.path while we load stuff.


0.9707390271351757


# Use AdaBoost

In [8]:
# `sklearn.cross_validation` was removed in scikit-learn 0.20 and made this
# import fail; `model_selection` is its successor module.
from sklearn import ensemble, metrics, model_selection, preprocessing

# TF-IDF features (default settings), fitted on the training review text only.
vectorizer = TfidfVectorizer()
vectorizer.fit(train['text'])

# Encode both splits with the fitted vectorizer.
X_train = vectorizer.transform(train['text'])
X_test = vectorizer.transform(test['text'])

# `.values` replaces `.as_matrix()`, which pandas deprecated and later removed.
y_train = train['stars'].values

# Boosted ensemble of 200 weak learners (library-default base estimator).
clf = ensemble.AdaBoostClassifier(n_estimators=200)
clf = clf.fit(X_train, y_train)
pred_y = clf.predict(X_test)

# Accuracy on the same data the model was fitted on (not a held-out estimate).
print(clf.score(X_train, y_train))

# One row per test review: (review id, predicted stars). `outid` holds
# test['review_id'], so the column is labelled accordingly. np.savetxt writes
# no header row, so the label does not change the CSV contents.
output = np.vstack((outid, pred_y)).T
output = pd.DataFrame(output, columns=['review_id', 'stars'])
np.savetxt("adaboost.csv", output, fmt='%s', delimiter=",")

  # Remove the CWD from sys.path while we load stuff.


0.5256971364261598
