In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LogisticRegression



## Reading TFIDF Avg w2v vectors(200-D)

In [2]:
tfidf_avg_w2v =pd.read_pickle("tfidf_avg_vec_200")
print(tfidf_avg_w2v.shape)
tfidf_avg_w2v.head()

(122109, 201)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,191,192,193,194,195,196,197,198,199,200
0,-0.00054,0.000332,0.000313,-0.000194,-8.3e-05,0.000283,-2e-05,0.000859,0.000307,0.000461,...,-0.00069,9.5e-05,0.000306,0.000165,0.000459,-0.001187,-0.000208,0.000765,0.000812,1
1,-0.000146,-6.6e-05,-1e-05,-0.000457,0.000192,-5e-06,-3.9e-05,0.000225,0.000138,-0.000238,...,-1.9e-05,0.000137,-0.000356,-0.000212,0.00022,8e-06,-0.000381,5.4e-05,0.000434,1
2,0.000432,-0.000329,0.000599,-0.000644,-0.000191,-0.000523,-0.000381,0.000843,-0.000436,-0.000367,...,-0.000331,-0.000169,-0.000617,0.000503,-0.000637,3.6e-05,-0.000775,-0.000453,-0.000177,0
3,0.000268,-0.000137,-0.000365,-0.000223,-0.000257,-7.2e-05,-0.000135,9.9e-05,3e-05,0.000237,...,1.1e-05,0.000123,0.000118,3.5e-05,-2.4e-05,0.000655,-0.000122,-6.4e-05,-0.00031,1
4,-0.000723,0.000649,-0.000167,0.000996,-0.000168,0.000339,8e-05,-0.000769,0.000997,-0.000348,...,-0.000674,-8.5e-05,-0.000134,0.000342,-0.000405,-0.000941,-0.000654,0.000165,0.000779,1


In [3]:
x =tfidf_avg_w2v.iloc[:,:200]
y =tfidf_avg_w2v.iloc[:,200]
print(x.shape)
y.shape

(122109, 200)


(122109,)

## Column Standardization

In [4]:
from sklearn.preprocessing import StandardScaler

In [5]:
a =StandardScaler()
x =a.fit_transform(x)

In [6]:
x_train,x_test,y_train,y_test =train_test_split(x,y,test_size =0.3)

In [7]:
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
y_test.shape

(85476, 200)
(36633, 200)
(85476,)


(36633,)

In [8]:
tuned_params = [{'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]}]

In [9]:
model = GridSearchCV(LogisticRegression(), tuned_params, scoring = 'accuracy')
model.fit(x_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]}],
       pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=0)

In [10]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [11]:
print(model.best_estimator_)
print(model.score(x_test, y_test))

LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
0.696339366145


In [12]:
y_pred = model.best_estimator_.predict(x_test)

In [13]:
acc = accuracy_score(y_test, y_pred, normalize=True) * float(100)
acc

69.633936614527883

We can clearly see increasing dimenions the model performs better than 100-D as the chances of data becomes linearly separable in higher dimensions increases.So we get the best hyperplane as dimensions increases due to which model behaves better.

## Confusion Matrix

In [14]:
confusion_matrix(y_test,y_pred).T

array([[11064,  5139],
       [ 5985, 14445]], dtype=int64)

## L1 regularization and sparsity

In [15]:
clf = LogisticRegression(C=1, penalty = 'l1')
clf.fit(x_train, y_train)
w = clf.coef_    
print(np.count_nonzero(w))

199


This means 199 out of 200 are non zero.

In [16]:
clf = LogisticRegression(C=0.1, penalty = 'l1')
clf.fit(x_train, y_train)
w = clf.coef_
print(np.count_nonzero(w))

198


In [17]:
clf = LogisticRegression(C=0.001, penalty = 'l1')
clf.fit(x_train, y_train)
w = clf.coef_
print(np.count_nonzero(w))

91


In [18]:
clf = LogisticRegression(C=0.00025, penalty = 'l1')
clf.fit(x_train, y_train)
w = clf.coef_
print(np.count_nonzero(w))

1


At c=0.00025 we are getting top 1 features and rest becomes zero.