In [1]:
import pandas as pd
from utils import blabla, print_report, predict_pipeline
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier

from sklearn.utils import shuffle
from collections import Counter
import pickle


PICKLED_DIR = "pickled_files"
DATA_DIR = "data"

In [2]:
training_df = shuffle(pd.read_csv("%s/train.tsv" %DATA_DIR, error_bad_lines=False , sep='\t'))[:]

In [3]:
print training_df.groupby(['Sentiment']).size()

Sentiment
0     7072
1    27273
2    79582
3    32927
4     9206
dtype: int64


# Normalize Sentiment column values from
#### [Negative:0, Somewhat Negative:1, Neutral:2, Somewhat Positive:3, Positive:4]  to [Negative:-1, Neutral:0, Positive:1]

In [4]:
training_df['3class_sent'] = training_df['Sentiment'].map({0:-1, 1:-1, 2:0, 3:1, 4:1})
print training_df.groupby(['3class_sent']).size()

3class_sent
-1    34345
 0    79582
 1    42133
dtype: int64


# Build the 3 Classifiers and pickle them and get their X_test/Y_test
### *Read blabla for more details

In [5]:
%%time
X_test1, Y_test1 = blabla(training_df, c_no=1, on_=1)#, label_col='3class_sent')
X_test2, Y_test2 = blabla(training_df[training_df['Sentiment']<2], c_no=2, on_=0)#, label_col="Sentiment")
X_test3, Y_test3 = blabla(training_df[training_df['Sentiment']>2], c_no=3, on_=0)#, label_col="Sentiment")



CPU times: user 20.6 s, sys: 198 ms, total: 20.8 s
Wall time: 20.8 s


# Load 3 classifiers and thier vectorizers

In [6]:
v1 = pickle.load(open("%s/vectorizer_1.pickle" %PICKLED_DIR, "rb"))
model1 = pickle.load(open("%s/model_1.pickle" %PICKLED_DIR, "rb"))

v2 = pickle.load(open("%s/vectorizer_2.pickle" %PICKLED_DIR, "rb"))
model2 = pickle.load(open("%s/model_2.pickle" %PICKLED_DIR, "rb"))

v3 = pickle.load(open("%s/vectorizer_3.pickle" %PICKLED_DIR, "rb"))
model3 = pickle.load(open("%s/model_3.pickle" %PICKLED_DIR, "rb"))

# Report results of Classifier-1 on all data on '3c_sentiment' column

In [7]:
print_report(Y_test1[:, 1], model1.predict(v1.transform(X_test1)), ['NEGATIVE', 'NEUTRAL', 'POSITIVE'])

             precision    recall  f1-score   support

   NEGATIVE       0.73      0.66      0.69     10389
    NEUTRAL       0.75      0.81      0.78     23801
   POSITIVE       0.78      0.71      0.74     12628

avg / total       0.75      0.75      0.75     46818

accuracy :  0.750929129822


# Report results of Classifier-2 on data['Sentiment']<2  

In [8]:
print_report(Y_test2[:, 0], model2.predict(v2.transform(X_test2)), ['NEGATIVE', 'SOMEWHAT NEGATIVE'])

                   precision    recall  f1-score   support

         NEGATIVE       0.44      0.55      0.49      2092
SOMEWHAT NEGATIVE       0.88      0.83      0.85      8212

      avg / total       0.79      0.77      0.78     10304

accuracy :  0.768827639752


# Report results of Classifier-3 on data['Sentiment']>2  

In [9]:
print_report(model3.predict(v3.transform(X_test3)), Y_test3[:, 0], ['SOMEWHAT POSITIVE', 'POSITIVE'])

                   precision    recall  f1-score   support

SOMEWHAT POSITIVE       0.81      0.88      0.85      9136
         POSITIVE       0.60      0.48      0.53      3504

      avg / total       0.76      0.77      0.76     12640

accuracy :  0.76835443038


# Run Pipeline to predict coming data

In [10]:
pred_sent = predict_pipeline(X_test1, model1, v1, model2, v2, model3, v3)

# Report results on testing_data

In [11]:
print_report(pred_sent, Y_test1[:, 0], ['NEGATIVE', 'SOMEWHAT NEGATIVE', 'NEUTRAL', 'SOMEWHAT POSITIVE', 'POSITIVE'])

                   precision    recall  f1-score   support

         NEGATIVE       0.77      0.56      0.65      2989
SOMEWHAT NEGATIVE       0.48      0.61      0.54      6432
          NEUTRAL       0.81      0.75      0.78     25869
SOMEWHAT POSITIVE       0.52      0.66      0.58      7742
         POSITIVE       0.80      0.59      0.68      3786

      avg / total       0.71      0.69      0.70     46818

accuracy :  0.688602674185


# Submission section

In [12]:
test_df = pd.read_csv("%s/test.tsv" %DATA_DIR, error_bad_lines=False , sep='\t')

In [13]:
# test_res = predict_pipeline(test_df['Phrase'].tolist(), model1, v1, model2, v2, model3, v3)

# tdf = pd.DataFrame({'PhraseId': test_df['PhraseId'].tolist(), 'Sentiment': test_res})
# tdf.to_csv("3classes_sent_SVM.csv", index=False)