# Import the Needed Libraries 

In [60]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
%matplotlib inline

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression


from collections import Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Intro to NLP Lab

In this lab, you'll be classifying randomly selected tweets from political officials into whether or not they are partisan tweets or neutral. In the following import statement, we're selecting only the columns that are important, but there may be more useful features in that set. Feel free to explore. 

In [13]:
import pandas as pd

df = pd.read_csv('datasets/political_media.csv',
                usecols=[7, 20])
df.head()

Unnamed: 0,bias,text
0,partisan,RT @nowthisnews: Rep. Trey Radel (R- #FL) slam...
1,partisan,VIDEO - #Obamacare: Full of Higher Costs and ...
2,neutral,Please join me today in remembering our fallen...
3,neutral,RT @SenatorLeahy: 1st step toward Senate debat...
4,partisan,.@amazon delivery #drones show need to update ...


In [14]:
df.bias.value_counts()

neutral     3689
partisan    1311
Name: bias, dtype: int64

## Set up

Please split the dataset into a training and test set and convert the `bias` feature into 0s and 1s.

In [15]:
df['bias'] = df['bias'].apply(lambda x: 1 if x =='partisan' else 0)

In [16]:
df.bias.value_counts()

0    3689
1    1311
Name: bias, dtype: int64

In [17]:
df.shape

(5000, 2)

In [18]:
from sklearn.model_selection import train_test_split

X = df['text']
y = df['bias']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33)

## Modeling

Please try the following techniques to transform the data. For each technique, do the following:

1. Transform the training data
2. Fit a `RandomForestClassifier` to the transformed training data
3. Transform the test data
4. Discuss the goodness of fit of your model using the test data and a classification report and confusion matrix

### 1. `CountVectorizer()`

In [40]:
'''Setting up the CountVectoriser'''
cv = CountVectorizer()
X_fit = cv.fit(X_train)

X_train_transformed = X_fit.transform(X_train)
X_test_transformed = X_fit.transform(X_test)


'''Fitting the RandomForestClassifier Model on Training'''
rf = RandomForestClassifier()
rf_fit = rf.fit(X_train_transformed, y_train)


'''Validating the score on Training and Test Sets'''
train_rf_score = rf.score(X_train_transformed, y_train)
print ('Training Score: ', train_rf_score)

test_rf_score = rf.score(X_test_transformed, y_test)
print ('Test Score: ', test_rf_score)


'''Model Evaluation'''

# Confusion Matrix 
conf_matrix = confusion_matrix(y_test, rf.predict(X_test_transformed))
print (conf_matrix)

# Classification Report 
class_report = classification_report(y_test, rf.predict(X_test_transformed))
print (class_report)

Training Score:  0.975223880597
Test Score:  0.721818181818
[[1146   55]
 [ 404   45]]
             precision    recall  f1-score   support

          0       0.74      0.95      0.83      1201
          1       0.45      0.10      0.16       449

avg / total       0.66      0.72      0.65      1650



We noticed that the training set seems to perform better but the test seem does not seem to perform as expected

### 2. `CountVectorizer()` with your choice of `min_df` and `max_df`

In [41]:
'''Setting up the CountVectoriser'''
cv = CountVectorizer(min_df=0.10, max_df=0.90)
X_fit = cv.fit(X_train)

X_train_transformed = X_fit.transform(X_train)
X_test_transformed = X_fit.transform(X_test)


'''Fitting the RandomForestClassifier Model on Training'''
rf = RandomForestClassifier()
rf_fit = rf.fit(X_train_transformed, y_train)


'''Validating the score on Training and Test Sets'''
train_rf_score = rf.score(X_train_transformed, y_train)
print ('Training Score: ', train_rf_score)

test_rf_score = rf.score(X_test_transformed, y_test)
print ('Test Score: ', test_rf_score)


'''Model Evaluation'''

# Confusion Matrix 
conf_matrix = confusion_matrix(y_test, rf.predict(X_test_transformed))
print (conf_matrix)

# Classification Report 
class_report = classification_report(y_test, rf.predict(X_test_transformed))
print (class_report)

Training Score:  0.939104477612
Test Score:  0.703636363636
[[1082  119]
 [ 370   79]]
             precision    recall  f1-score   support

          0       0.75      0.90      0.82      1201
          1       0.40      0.18      0.24       449

avg / total       0.65      0.70      0.66      1650



We noticed that the training set seems to perform better but the test seem does not seem to perform as expected

### 3. `CountVectorizer()` with English stop words

In [42]:
'''Setting up the CountVectoriser'''
cv = CountVectorizer(stop_words='english')
X_fit = cv.fit(X_train)

X_train_transformed = X_fit.transform(X_train)
X_test_transformed = X_fit.transform(X_test)


'''Fitting the RandomForestClassifier Model on Training'''
rf = RandomForestClassifier()
rf_fit = rf.fit(X_train_transformed, y_train)


'''Validating the score on Training and Test Sets'''
train_rf_score = rf.score(X_train_transformed, y_train)
print ('Training Score: ', train_rf_score)

test_rf_score = rf.score(X_test_transformed, y_test)
print ('Test Score: ', test_rf_score)


'''Model Evaluation'''

# Confusion Matrix 
conf_matrix = confusion_matrix(y_test, rf.predict(X_test_transformed))
print (conf_matrix)

# Classification Report 
class_report = classification_report(y_test, rf.predict(X_test_transformed))
print (class_report)

Training Score:  0.971940298507
Test Score:  0.74
[[1146   55]
 [ 374   75]]
             precision    recall  f1-score   support

          0       0.75      0.95      0.84      1201
          1       0.58      0.17      0.26       449

avg / total       0.71      0.74      0.68      1650



### 4. `TfidfVectorizer()` 

In [44]:
'''Setting up the CountVectoriser'''
tf = TfidfVectorizer()
X_fit = tf.fit(X_train)

X_train_transformed = X_fit.transform(X_train)
X_test_transformed = X_fit.transform(X_test)


'''Fitting the RandomForestClassifier Model on Training'''
rf = RandomForestClassifier()
rf_fit = rf.fit(X_train_transformed, y_train)


'''Validating the score on Training and Test Sets'''
train_rf_score = rf.score(X_train_transformed, y_train)
print ('Training Score: ', train_rf_score)

test_rf_score = rf.score(X_test_transformed, y_test)
print ('Test Score: ', test_rf_score)


'''Model Evaluation'''

# Confusion Matrix 
conf_matrix = confusion_matrix(y_test, rf.predict(X_test_transformed))
print (conf_matrix)

# Classification Report 
class_report = classification_report(y_test, rf.predict(X_test_transformed))
print (class_report)

Training Score:  0.974626865672
Test Score:  0.726666666667
[[1145   56]
 [ 395   54]]
             precision    recall  f1-score   support

          0       0.74      0.95      0.84      1201
          1       0.49      0.12      0.19       449

avg / total       0.67      0.73      0.66      1650



### 5. `TfidfVectorizer()` with English stop words

In [45]:
'''Setting up the CountVectoriser'''
tf = TfidfVectorizer(stop_words='english')
X_fit = tf.fit(X_train)

X_train_transformed = X_fit.transform(X_train)
X_test_transformed = X_fit.transform(X_test)


'''Fitting the RandomForestClassifier Model on Training'''
rf = RandomForestClassifier()
rf_fit = rf.fit(X_train_transformed, y_train)


'''Validating the score on Training and Test Sets'''
train_rf_score = rf.score(X_train_transformed, y_train)
print ('Training Score: ', train_rf_score)

test_rf_score = rf.score(X_test_transformed, y_test)
print ('Test Score: ', test_rf_score)


'''Model Evaluation'''

# Confusion Matrix 
conf_matrix = confusion_matrix(y_test, rf.predict(X_test_transformed))
print (conf_matrix)

# Classification Report 
class_report = classification_report(y_test, rf.predict(X_test_transformed))
print (class_report)

Training Score:  0.973432835821
Test Score:  0.750303030303
[[1148   53]
 [ 359   90]]
             precision    recall  f1-score   support

          0       0.76      0.96      0.85      1201
          1       0.63      0.20      0.30       449

avg / total       0.73      0.75      0.70      1650



### Moving forward

With the remainder of your time, please try and find the best model and data transformation to predict partisan tweets. This is a challenging data set and can be approached from a number of ways.

Some techniques to try are:

1. Different types of data transformation 
2. Custom preprocessors for `CountVectorizer`
3. Custom stopword lists
4. Use of a dimensionality reduction technique (like `TruncatedSVD`)
5. Optimizing hyperparameters using `GridSearchCV`
6. Trying a different modeling technique such as `KNeighborsClassifier` or `LogisticRegression`

###### Testing out the Model with the RandomClassifier Apporach. We also tried to hypertune some of the Hyper Parameters

In [55]:
'''Setting up the CountVectoriser'''
tf = TfidfVectorizer(stop_words='english')
X_fit = tf.fit(X_train)

X_train_transformed = X_fit.transform(X_train)
X_test_transformed = X_fit.transform(X_test)


params = {
    'n_estimators': [50,100,150], 
    'criterion' : ['gini', 'entropy'], 
    'max_depth': [4,6,8]
    
}

'''Fitting the RandomForestClassifier Model on Training'''
rf = RandomForestClassifier()
gv_rf = GridSearchCV(rf, param_grid=params,n_jobs = -1, verbose=2)
gv_rf.fit(X_train_transformed, y_train)

Fitting 3 folds for each of 18 candidates, totalling 54 fits
[CV] criterion=gini, max_depth=4, n_estimators=50 ....................
[CV] criterion=gini, max_depth=4, n_estimators=50 ....................
[CV] criterion=gini, max_depth=4, n_estimators=50 ....................
[CV] criterion=gini, max_depth=4, n_estimators=100 ...................
[CV] ..... criterion=gini, max_depth=4, n_estimators=50, total=   0.3s
[CV] ..... criterion=gini, max_depth=4, n_estimators=50, total=   0.3s
[CV] criterion=gini, max_depth=4, n_estimators=100 ...................
[CV] ..... criterion=gini, max_depth=4, n_estimators=50, total=   0.3s
[CV] criterion=gini, max_depth=4, n_estimators=150 ...................
[CV] criterion=gini, max_depth=4, n_estimators=100 ...................
[CV] .... criterion=gini, max_depth=4, n_estimators=100, total=   0.5s
[CV] criterion=gini, max_depth=4, n_estimators=150 ...................
[CV] .... criterion=gini, max_depth=4, n_estimators=100, total=   0.4s
[CV] criterion=g

[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    5.7s


[CV] .. criterion=entropy, max_depth=6, n_estimators=50, total=   0.3s
[CV] criterion=entropy, max_depth=6, n_estimators=50 .................
[CV] . criterion=entropy, max_depth=4, n_estimators=150, total=   0.8s
[CV] criterion=entropy, max_depth=6, n_estimators=50 .................
[CV] . criterion=entropy, max_depth=4, n_estimators=150, total=   0.8s
[CV] criterion=entropy, max_depth=6, n_estimators=100 ................
[CV] . criterion=entropy, max_depth=4, n_estimators=150, total=   0.8s
[CV] criterion=entropy, max_depth=6, n_estimators=100 ................
[CV] .. criterion=entropy, max_depth=6, n_estimators=50, total=   0.3s
[CV] criterion=entropy, max_depth=6, n_estimators=100 ................
[CV] .. criterion=entropy, max_depth=6, n_estimators=50, total=   0.3s
[CV] criterion=entropy, max_depth=6, n_estimators=150 ................
[CV] . criterion=entropy, max_depth=6, n_estimators=100, total=   0.7s
[CV] criterion=entropy, max_depth=6, n_estimators=150 ................
[CV] .

[Parallel(n_jobs=-1)]: Done  54 out of  54 | elapsed:    9.4s finished


GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'n_estimators': [50, 100, 150], 'criterion': ['gini', 'entropy'], 'max_depth': [4, 6, 8]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=2)

In [56]:
print ('Best Estimator: ', gv_rf.best_estimator_)
print ('Best Score: ', gv_rf.best_score_)

Best Estimator:  RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=4, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=50, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)
Best Score:  0.742686567164


In [57]:
'''Validating the score on Training and Test Sets'''
train_rf_score = gv_rf.score(X_train_transformed, y_train)
print ('Training Score: ', train_rf_score)

test_rf_score = gv_rf.score(X_test_transformed, y_test)
print ('Test Score: ', test_rf_score)


'''Model Evaluation'''

# Confusion Matrix 
conf_matrix = confusion_matrix(y_test, gv_rf.predict(X_test_transformed))
print (conf_matrix)

# Classification Report 
class_report = classification_report(y_test, gv_rf.predict(X_test_transformed))
print (class_report)

Training Score:  0.742686567164
Test Score:  0.727878787879
[[1201    0]
 [ 449    0]]
             precision    recall  f1-score   support

          0       0.73      1.00      0.84      1201
          1       0.00      0.00      0.00       449

avg / total       0.53      0.73      0.61      1650



  'precision', 'predicted', average, warn_for)


Training Score:  0.742686567164
Test Score:  0.727878787879
[[1201    0]
 [ 449    0]]
             precision    recall  f1-score   support

          0       0.73      1.00      0.84      1201
          1       0.00      0.00      0.00       449

avg / total       0.53      0.73      0.61      1650



  'precision', 'predicted', average, warn_for)


# Grid Search and Pipeline with Logisitic Regression 

In [89]:
'''Setting up the TFIDVectorizer'''

tf = TfidfVectorizer(stop_words='english')
X_fit = tf.fit(X_train)

X_train_transformed = X_fit.transform(X_train)
X_test_transformed = X_fit.transform(X_test)

logreg = LogisticRegression()


'''Setting up the Parameters for GridSearch'''
params = {
    
#     'vect__ngram_range': [(1,1)],
    'logreg__penalty': ['l1', 'l2'], 
    'logreg__C': [1.0,10,100], 
    'logreg__max_iter': [100,150,200]   
}

'''Setting the Pipeline'''
logreg_tk_pipe = Pipeline([('vect', tf), 
                     ('logreg', logreg)])

'''Fitting the Model on Training'''
gs_logreg = GridSearchCV(logreg_tk_pipe, param_grid=params,n_jobs = -1, verbose=2, scoring='accuracy')
gs_logreg.fit(X_train, y_train)

Fitting 3 folds for each of 18 candidates, totalling 54 fits
[CV] logreg__C=1.0, logreg__max_iter=100, logreg__penalty=l1 .........
[CV] logreg__C=1.0, logreg__max_iter=100, logreg__penalty=l1 .........
[CV] logreg__C=1.0, logreg__max_iter=100, logreg__penalty=l1 .........
[CV] logreg__C=1.0, logreg__max_iter=100, logreg__penalty=l2 .........
[CV]  logreg__C=1.0, logreg__max_iter=100, logreg__penalty=l1, total=   0.2s
[CV]  logreg__C=1.0, logreg__max_iter=100, logreg__penalty=l1, total=   0.2s
[CV] logreg__C=1.0, logreg__max_iter=100, logreg__penalty=l2 .........
[CV] logreg__C=1.0, logreg__max_iter=100, logreg__penalty=l2 .........
[CV]  logreg__C=1.0, logreg__max_iter=100, logreg__penalty=l1, total=   0.2s
[CV] logreg__C=1.0, logreg__max_iter=150, logreg__penalty=l1 .........
[CV]  logreg__C=1.0, logreg__max_iter=100, logreg__penalty=l2, total=   0.3s
[CV] logreg__C=1.0, logreg__max_iter=150, logreg__penalty=l1 .........
[CV]  logreg__C=1.0, logreg__max_iter=150, logreg__penalty=l1, 

[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   18.5s


[CV]  logreg__C=100, logreg__max_iter=100, logreg__penalty=l1, total=   0.3s
[CV] logreg__C=100, logreg__max_iter=100, logreg__penalty=l1 .........
[CV]  logreg__C=100, logreg__max_iter=100, logreg__penalty=l1, total=   0.3s
[CV] logreg__C=100, logreg__max_iter=100, logreg__penalty=l1 .........
[CV]  logreg__C=100, logreg__max_iter=100, logreg__penalty=l1, total=   0.3s
[CV] logreg__C=100, logreg__max_iter=100, logreg__penalty=l2 .........
[CV]  logreg__C=100, logreg__max_iter=100, logreg__penalty=l2, total=   0.3s
[CV] logreg__C=100, logreg__max_iter=100, logreg__penalty=l2 .........
[CV]  logreg__C=100, logreg__max_iter=100, logreg__penalty=l2, total=   0.3s
[CV] logreg__C=100, logreg__max_iter=100, logreg__penalty=l2 .........
[CV]  logreg__C=100, logreg__max_iter=100, logreg__penalty=l2, total=   0.3s
[CV] logreg__C=100, logreg__max_iter=150, logreg__penalty=l1 .........
[CV]  logreg__C=100, logreg__max_iter=150, logreg__penalty=l1, total=   0.3s
[CV] logreg__C=100, logreg__max_ite

[Parallel(n_jobs=-1)]: Done  54 out of  54 | elapsed:   34.9s finished


GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
  ...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'logreg__penalty': ['l1', 'l2'], 'logreg__C': [1.0, 10, 100], 'logreg__max_iter': [100, 150, 200]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='accuracy', verbose=2)

In [90]:
gs_logreg.score(X_train, y_train)

0.99432835820895527

In [91]:
gs_logreg.score(X_test, y_test)

0.75272727272727269