In [6]:
import numpy as np
import pandas as pd
import os
import sklearn.linear_model
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import KFold
import sklearn.ensemble
import matplotlib
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
data_dir = 'data_reviews'
x_train_df = pd.read_csv(os.path.join(data_dir, 'x_train.csv'))
y_train_df = pd.read_csv(os.path.join(data_dir, 'y_train.csv'))
x_test_df = pd.read_csv(os.path.join(data_dir, 'x_test.csv'))
N, n_cols = x_train_df.shape
print("Shape of x_train_df: (%d, %d)" % (N,n_cols))
print("Shape of y_train_df: %s" % str(y_train_df.shape))

# Print out the first five rows and last five rows
tr_text_list = x_train_df['text'].values.tolist()

Shape of x_train_df: (2400, 2)
Shape of y_train_df: (2400, 1)


In [126]:
kf = KFold(n_splits=3, shuffle=True)
stop = ['and', 'for', 'in', 'is', 'it', 'of', 'the', 'this', 'to', 'was','are']

In [127]:
BOW_vectorizer = TfidfVectorizer(max_features = 100,stop_words = stop, ngram_range = (1,1),lowercase = True,max_df = 1.4, min_df = 1)
BOW_vectorizer.fit_transform(tr_text_list)
BOW_vectorizer.get_feature_names()
BOW_vectorizer.transform(tr_text_list).toarray()
Train_list = BOW_vectorizer.transform(x_train_df['text'].values.tolist()).toarray()
Test_list = BOW_vectorizer.transform(x_test_df['text'].values.tolist()).toarray()

In [128]:
BOW_vectorizer.get_feature_names()

['about',
 'after',
 'again',
 'all',
 'also',
 'an',
 'any',
 'as',
 'at',
 'back',
 'bad',
 'be',
 'because',
 'been',
 'best',
 'better',
 'but',
 'by',
 'can',
 'could',
 'did',
 'didn',
 'do',
 'don',
 'even',
 'ever',
 'excellent',
 'film',
 'first',
 'food',
 'from',
 'get',
 'go',
 'good',
 'great',
 'had',
 'has',
 'have',
 'he',
 'headset',
 'here',
 'his',
 'how',
 'if',
 'just',
 'like',
 'love',
 'made',
 'make',
 'me',
 'more',
 'movie',
 'much',
 'my',
 'nice',
 'no',
 'not',
 'on',
 'one',
 'only',
 'or',
 'our',
 'out',
 'phone',
 'place',
 'product',
 'quality',
 'really',
 'recommend',
 'see',
 'service',
 'so',
 'some',
 'sound',
 'than',
 'that',
 'them',
 'there',
 'they',
 'time',
 'too',
 'up',
 'use',
 've',
 'very',
 'way',
 'we',
 'well',
 'were',
 'what',
 'when',
 'which',
 'who',
 'will',
 'with',
 'work',
 'works',
 'would',
 'you',
 'your']

In [108]:
logistic = sklearn.linear_model.LogisticRegression(C=1000, max_iter=100,penalty='l1',solver='saga')
logistic_hyperparameter_grid_by_name = dict(
    C = [0.1, 1, 10, 100, 1000],
    max_iter = [150,200],
    )
logistic_search = sklearn.model_selection.GridSearchCV(
    logistic,
    logistic_hyperparameter_grid_by_name,
    scoring = 'balanced_accuracy',
    cv = kf,
    return_train_score=True,
    refit=False) # TODO

logistic_search.fit(Train_list, y_train_df.values.reshape(-1,))
logistic_search_results_df = pd.DataFrame(logistic_search.cv_results_).copy()



In [102]:
print(logistic_search.best_params_)
print(logistic_search.best_score_)

{'C': 1, 'max_iter': 150}
0.7840792027643376


In [38]:
logistic = sklearn.linear_model.LogisticRegression(C=1000, max_iter=150,penalty='l1',solver='saga')
logistic.fit(Train_list, y_train_df.values.reshape(-1,))

yproba_te = logistic.predict(Test_list)
np.savetxt('yproba1_test.txt', yproba_te, delimiter='\n')
yproba_te



array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,

In [None]:
from sklearn.neural_network import MLPClassifier
mpl = MLPClassifier(
        hidden_layer_sizes=50,
        activation='relu',
        alpha=0.0001,
        tol=1e-5,
        n_iter_no_change=50,
        max_iter=1000,
        solver='sgd',
        batch_size=500,
        learning_rate='adaptive', learning_rate_init=0.2, momentum=0.0,
        )
mpl_hyperparameter_grid_by_name = dict(
    batch_size = [20,50,100],
    learning_rate_init = [0.3,0.5,1]
    )
mpl_searcher = sklearn.model_selection.GridSearchCV(
    mpl,
    mpl_hyperparameter_grid_by_name,
    scoring = 'balanced_accuracy',
    cv = kf,
    return_train_score=True,
    refit=False) # TODO
mpl_searcher.fit(Train_list, y_train_df.values.reshape(-1,))
mpl_search_results_df = pd.DataFrame(mpl_searcher.cv_results_).copy()



In [None]:
print(mpl_searcher.best_params_)
print(mpl_searcher.best_score_)