In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
import pandas as pd
import joblib
import numpy as np


In [2]:
trainData = pd.read_excel('/app/model/data/Task-2/train.xlsx')
testData = pd.read_excel('/app/model/data/Task-2/test.xlsx').drop('rid', axis=1)

In [3]:
trainData.groupby('label').count()

Unnamed: 0_level_0,text
label,Unnamed: 1_level_1
-1,1511
1,2827


In [4]:
import time
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
pipeline = Pipeline([
    ('tfidfvec', TfidfVectorizer()),
    ('svm', svm.SVC(probability=True)),
])
# Create feature vectors
# vectorizer = TfidfVectorizer(min_df = 5,
#                              max_df = 0.8,
#                              sublinear_tf = True,
#                              use_idf = True)


In [5]:
parameters = {
    'tfidfvec__max_df': (0.5, 0.75, 1.0),
    'tfidfvec__min_df': [5],
    # 'tfidfvec__min_df': (0.5, 0.75, 1.0),
    'tfidfvec__ngram_range': ((1, 1), (1, 2), (1, 3)),
    'tfidfvec__use_idf': (True, False),
    'tfidfvec__norm': ('l1', 'l2'),
    'svm__C': (1, 10, 100, 1000),
    'svm__kernel':('linear', 'rbf'),
    # 'svm__probability':[True]
}
grid_search = GridSearchCV(pipeline, parameters, scoring='f1', n_jobs=-1, verbose=1, cv=5)

In [6]:
grid_search.fit(trainData['text'], trainData['label'])

print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Fitting 5 folds for each of 288 candidates, totalling 1440 fits


Performing grid search...
pipeline: ['tfidfvec', 'svm']
parameters:
Best score: 0.853
Best parameters set:
	svm__C: 1
	svm__kernel: 'linear'
	tfidfvec__max_df: 0.5
	tfidfvec__min_df: 5
	tfidfvec__ngram_range: (1, 3)
	tfidfvec__norm: 'l2'
	tfidfvec__use_idf: True


Fitting 5 folds for each of 288 candidates, totalling 1440 fits
Performing grid search...
pipeline: ['tfidfvec', 'svm']
parameters:
Best score: 0.853
Best parameters set:
	svm__C: 1
	svm__kernel: 'linear'
	tfidfvec__max_df: 0.5
	tfidfvec__min_df: 5
	tfidfvec__ngram_range: (1, 3)
	tfidfvec__norm: 'l2'
	tfidfvec__use_idf: True

In [8]:
joblib.dump(grid_search, "text_sentiment_model_svm00.joblib")

['text_sentiment_model_svm00.joblib']

In [7]:
cvresult = pd.DataFrame(grid_search.cv_results_)
cvresult

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_svm__C,param_svm__kernel,param_tfidfvec__max_df,param_tfidfvec__min_df,param_tfidfvec__ngram_range,param_tfidfvec__norm,param_tfidfvec__use_idf,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,4.289687,0.270460,0.182254,0.017979,1,linear,0.5,5,"(1, 1)",l1,True,"{'svm__C': 1, 'svm__kernel': 'linear', 'tfidfv...",0.794926,0.797745,0.793517,0.794908,0.793249,0.794869,0.001596,256
1,4.178591,0.316525,0.182395,0.026076,1,linear,0.5,5,"(1, 1)",l1,False,"{'svm__C': 1, 'svm__kernel': 'linear', 'tfidfv...",0.793539,0.795502,0.794097,0.792958,0.792135,0.793646,0.001133,262
2,3.431350,0.149906,0.146118,0.015104,1,linear,0.5,5,"(1, 1)",l2,True,"{'svm__C': 1, 'svm__kernel': 'linear', 'tfidfv...",0.858340,0.853112,0.839798,0.833751,0.852010,0.847402,0.009140,73
3,3.470668,0.078368,0.142016,0.002219,1,linear,0.5,5,"(1, 1)",l2,False,"{'svm__C': 1, 'svm__kernel': 'linear', 'tfidfv...",0.849338,0.847791,0.836634,0.835821,0.850459,0.844008,0.006415,142
4,4.807766,0.249702,0.243342,0.020404,1,linear,0.5,5,"(1, 2)",l1,True,"{'svm__C': 1, 'svm__kernel': 'linear', 'tfidfv...",0.792162,0.791055,0.791317,0.789068,0.790470,0.790814,0.001029,271
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
283,6.431461,0.322735,0.278572,0.033373,1000,rbf,1.0,5,"(1, 2)",l2,False,"{'svm__C': 1000, 'svm__kernel': 'rbf', 'tfidfv...",0.857627,0.865132,0.840407,0.824121,0.855196,0.848497,0.014592,46
284,6.449466,0.288107,0.256542,0.025392,1000,rbf,1.0,5,"(1, 3)",l1,True,"{'svm__C': 1000, 'svm__kernel': 'rbf', 'tfidfv...",0.853016,0.861614,0.835184,0.827296,0.846615,0.844745,0.012267,124
285,6.686903,0.360644,0.260586,0.016122,1000,rbf,1.0,5,"(1, 3)",l1,False,"{'svm__C': 1000, 'svm__kernel': 'rbf', 'tfidfv...",0.853265,0.862583,0.830093,0.835000,0.839655,0.844119,0.012036,136
286,6.520634,0.211792,0.242648,0.007022,1000,rbf,1.0,5,"(1, 3)",l2,True,"{'svm__C': 1000, 'svm__kernel': 'rbf', 'tfidfv...",0.859797,0.863934,0.840678,0.834315,0.854715,0.850688,0.011340,16


In [None]:
testData['proba'] = np.max(classifier_linear.predict_proba(test_vectors), axis = 1)

In [None]:
report

{'-1': {'precision': 0.8942307692307693,
  'recall': 0.800132362673726,
  'f1-score': 0.8445686342996855,
  'support': 1511},
 '1': {'precision': 0.898861352980576,
  'recall': 0.9494163424124513,
  'f1-score': 0.9234474453810425,
  'support': 2827},
 'accuracy': 0.8974181650530199,
 'macro avg': {'precision': 0.8965460611056726,
  'recall': 0.8747743525430887,
  'f1-score': 0.884008039840364,
  'support': 4338},
 'weighted avg': {'precision': 0.8972484410289951,
  'recall': 0.8974181650530199,
  'f1-score': 0.8959725990131472,
  'support': 4338}}