In [9]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
import pandas as pd
import joblib
import numpy as np


In [10]:
traindf = pd.read_excel('./data/Task-2/train.xlsx')
testdf = pd.read_excel('./data/Task-2/test.xlsx').drop('rid', axis=1)


In [11]:
from imblearn.over_sampling import RandomOverSampler
# trainData.groupby('label').count()

traindf.drop_duplicates(subset='text',inplace=True)
#class balancing
ros = RandomOverSampler()
train_x, train_y = ros.fit_resample(np.array(traindf['text']).reshape(-1, 1), np.array(traindf['label']).reshape(-1, 1))
traindf_balance = pd.DataFrame(list(zip([x[0] for x in train_x], train_y)), columns = ['text', 'label'])
traindf_balance['label'].value_counts()

label
 1    2822
-1    2822
Name: count, dtype: int64

In [12]:
traindf_balance

Unnamed: 0,text,label
0,Fiskars has a strong portfolio of internationa...,1
1,METALS-Zinc surges 12 pct after Glencore cuts ...,1
2,"According to Scanfil , demand for telecommunic...",-1
3,dbs launches new banking api developer platfor...,1
4,Theodosopoulos said Tellabs could be of value ...,1
...,...,...
5639,"National Conciliator Juhani Salonius , who met...",-1
5640,market fell not solely on rm1 trillion governm...,-1
5641,aramco plans to ship first crude oil to malays...,-1
5642,aramco plans to ship first crude oil to malays...,-1


In [13]:
import time
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
pipeline = Pipeline([
    ('tfidfvec', TfidfVectorizer()),
    ('svm', svm.SVC(probability=True)),
])
# Create feature vectors
# vectorizer = TfidfVectorizer(min_df = 5,
#                              max_df = 0.8,
#                              sublinear_tf = True,
#                              use_idf = True)


In [14]:
parameters = {
    'tfidfvec__max_df': (0.5, 0.75, 1.0),
    'tfidfvec__min_df': [5],
    # 'tfidfvec__min_df': (0.5, 0.75, 1.0),
    'tfidfvec__ngram_range': ((1, 1), (1, 2), (1, 3)),
    'tfidfvec__use_idf': (True, False),
    'tfidfvec__norm': ('l1', 'l2'),
    'svm__C': (1, 10, 100, 1000),
    'svm__kernel':('linear', 'rbf'),
    # 'svm__probability':[True]
}
grid_search = GridSearchCV(pipeline, parameters, scoring='f1', n_jobs=-1, verbose=1, cv=5)

In [15]:
grid_search.fit(traindf_balance['text'],traindf_balance['label'])

print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Fitting 5 folds for each of 288 candidates, totalling 1440 fits
Performing grid search...
pipeline: ['tfidfvec', 'svm']
parameters:
Best score: 0.896
Best parameters set:
	svm__C: 10
	svm__kernel: 'rbf'
	tfidfvec__max_df: 0.5
	tfidfvec__min_df: 5
	tfidfvec__ngram_range: (1, 1)
	tfidfvec__norm: 'l2'
	tfidfvec__use_idf: True


Fitting 5 folds for each of 288 candidates, totalling 1440 fits
Performing grid search...
pipeline: ['tfidfvec', 'svm']
parameters:
Best score: 0.853
Best parameters set:
	svm__C: 1
	svm__kernel: 'linear'
	tfidfvec__max_df: 0.5
	tfidfvec__min_df: 5
	tfidfvec__ngram_range: (1, 3)
	tfidfvec__norm: 'l2'
	tfidfvec__use_idf: True

In [16]:
joblib.dump(grid_search, "text_sentiment_model_svm00.joblib")

['text_sentiment_model_svm00.joblib']

In [17]:
cvresult = pd.DataFrame(grid_search.cv_results_)
cvresult

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_svm__C,param_svm__kernel,param_tfidfvec__max_df,param_tfidfvec__min_df,param_tfidfvec__ngram_range,param_tfidfvec__norm,param_tfidfvec__use_idf,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,12.292284,0.097109,0.560750,0.027192,1,linear,0.5,5,"(1, 1)",l1,True,"{'svm__C': 1, 'svm__kernel': 'linear', 'tfidfv...",0.777778,0.795830,0.776874,0.777570,0.813620,0.788334,0.014520,271
1,11.954178,0.189580,0.518860,0.012573,1,linear,0.5,5,"(1, 1)",l1,False,"{'svm__C': 1, 'svm__kernel': 'linear', 'tfidfv...",0.725835,0.746599,0.731010,0.739602,0.763828,0.741375,0.013293,280
2,9.014627,0.219767,0.361824,0.022573,1,linear,0.5,5,"(1, 1)",l2,True,"{'svm__C': 1, 'svm__kernel': 'linear', 'tfidfv...",0.829006,0.836237,0.825571,0.834586,0.856604,0.836401,0.010802,259
3,9.270876,0.147565,0.368253,0.021005,1,linear,0.5,5,"(1, 1)",l2,False,"{'svm__C': 1, 'svm__kernel': 'linear', 'tfidfv...",0.823009,0.829565,0.810909,0.832090,0.841035,0.827322,0.010038,268
4,15.284053,0.227565,0.674501,0.024376,1,linear,0.5,5,"(1, 2)",l1,True,"{'svm__C': 1, 'svm__kernel': 'linear', 'tfidfv...",0.763736,0.787825,0.759070,0.770833,0.807692,0.777831,0.017839,274
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
283,15.494116,0.651162,0.674211,0.144905,1000,rbf,1.0,5,"(1, 2)",l2,False,"{'svm__C': 1000, 'svm__kernel': 'rbf', 'tfidfv...",0.866033,0.885135,0.867892,0.920574,0.931818,0.894290,0.027123,34
284,16.436730,1.184496,0.562114,0.024313,1000,rbf,1.0,5,"(1, 3)",l1,True,"{'svm__C': 1000, 'svm__kernel': 'rbf', 'tfidfv...",0.867241,0.875740,0.862099,0.917466,0.928775,0.890264,0.027413,109
285,16.055703,0.597328,0.507503,0.061343,1000,rbf,1.0,5,"(1, 3)",l1,False,"{'svm__C': 1000, 'svm__kernel': 'rbf', 'tfidfv...",0.872727,0.878788,0.860870,0.920574,0.924690,0.891530,0.026073,94
286,15.360372,0.424929,0.498348,0.042687,1000,rbf,1.0,5,"(1, 3)",l2,True,"{'svm__C': 1000, 'svm__kernel': 'rbf', 'tfidfv...",0.860364,0.879195,0.869489,0.923664,0.930806,0.892703,0.028905,73


In [18]:
testData['proba'] = np.max(classifier_linear.predict_proba(test_vectors), axis = 1)

NameError: name 'classifier_linear' is not defined

In [None]:
report