In [2]:
import pandas as pd
import numpy as np

# 04 - Applied ML
## Question 1: Propensity score matching



In [1]:
#

## Question 2: Applied ML

### 1. Loading The 20 newsgroups text dataset


In [30]:
from sklearn.datasets import fetch_20newsgroups
newsgroups_train = fetch_20newsgroups(subset='train')

In [31]:
# We will print only the first rows
data_to_print = newsgroups_train.data[0:5]
target_to_print = newsgroups_train.target[0:5]

# Make a pandas dataframe from sklearn dataset
df = pd.DataFrame(np.c_[data_to_print, target_to_print], columns= np.append("Raw text", "Category"))
df.Category = df.Category.map(lambda cat_index: newsgroups_train.target_names[int(cat_index)])
df.head()

Unnamed: 0,Raw text,Category
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,rec.autos
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,comp.sys.mac.hardware
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,comp.sys.mac.hardware
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,comp.graphics
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,sci.space


#### Compute TF-IDF features

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(newsgroups_train.data)

In [33]:
# We will print only the first rows
data_to_print = vectors.data[0:5]
target_to_print = newsgroups_train.target[0:5]

# Make a pandas dataframe from sklearn dataset
df = pd.DataFrame(np.c_[data_to_print, target_to_print], columns= np.append("TF-IDF", "Category"))
df.Category = df.Category.map(lambda cat_index: newsgroups_train.target_names[int(cat_index)])
df.head()

Unnamed: 0,TF-IDF,Category
0,0.05747,rec.autos
1,0.353835,comp.sys.mac.hardware
2,0.259709,comp.sys.mac.hardware
3,0.211868,comp.graphics
4,0.054614,sci.space


#### Split train and test/val set
We split the loaded data into 3 subsets : 
- Training set (80%)
- Test set (10%)
- Validation set (10%

In [34]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(vectors, newsgroups_train.target, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(vectors, newsgroups_train.target, test_size=0.5, random_state=42)

print("Train set shape:", X_train.shape)
print("Test set shape:", X_test.shape)
print("Validation set shape:", X_val.shape)

Train set shape: (9051, 130107)
Test set shape: (5657, 130107)
Validation set shape: (5657, 130107)


### 2 Classification with Random forest

#### Using default hyperparameters : 


In [35]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

clf = RandomForestClassifier(n_jobs=2, random_state=0)
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=2, oob_score=False, random_state=0,
            verbose=0, warm_start=False)

In [36]:
preds = clf.predict(X_test) 

In [37]:
print("Test Accuracy:", accuracy_score(y_test, preds))
print("Validation Accuracy :", accuracy_score(y_val, clf.predict(X_val)))

Test Accuracy: 0.853809439632
Validation Accuracy : 0.998232278593


In [38]:
# https://gist.github.com/nickynicolson/202fe765c99af49acb20ea9f77b6255e
# Convert a sklearn confusion matrix output to an elegant dataframe
def cm2df(cm, labels):
    df = pd.DataFrame()
    # rows
    for i, row_label in enumerate(labels):
        rowdata={}
        # columns
        for j, col_label in enumerate(labels): 
            rowdata[col_label]=cm[i,j]
        df = df.append(pd.DataFrame.from_dict({row_label:rowdata}, orient='index'))
    return df[labels]


confusion_matrix_df = cm2df(confusion_matrix(y_test, preds), newsgroups_train.target_names)
confusion_matrix_df

Unnamed: 0,alt.atheism,comp.graphics,comp.os.ms-windows.misc,comp.sys.ibm.pc.hardware,comp.sys.mac.hardware,comp.windows.x,misc.forsale,rec.autos,rec.motorcycles,rec.sport.baseball,rec.sport.hockey,sci.crypt,sci.electronics,sci.med,sci.space,soc.religion.christian,talk.politics.guns,talk.politics.mideast,talk.politics.misc,talk.religion.misc
alt.atheism,205,1,1,0,0,0,0,1,0,1,1,1,0,3,0,11,1,1,1,5
comp.graphics,1,243,12,3,2,5,5,1,0,3,0,2,2,2,2,0,2,0,0,0
comp.os.ms-windows.misc,0,17,249,11,1,3,4,0,0,2,0,0,3,0,2,0,0,0,1,0
comp.sys.ibm.pc.hardware,2,19,14,236,7,4,10,1,0,1,0,0,5,1,0,1,0,0,0,0
comp.sys.mac.hardware,0,11,6,25,229,1,4,4,2,2,1,0,5,2,0,2,2,0,0,0
comp.windows.x,1,7,4,5,6,260,1,0,0,0,1,0,0,0,0,0,0,0,0,0
misc.forsale,1,5,3,6,3,1,250,6,2,0,1,0,2,1,0,0,0,0,0,0
rec.autos,4,5,3,6,5,2,8,261,5,5,0,1,3,1,0,1,1,1,0,0
rec.motorcycles,3,3,0,1,2,0,4,9,293,0,1,0,1,1,0,1,0,0,0,0
rec.sport.baseball,0,3,2,3,3,3,3,1,2,254,8,0,0,0,0,1,0,1,0,0


#### Grid search cross validation : 

In [39]:
from sklearn.model_selection import GridSearchCV

param_grid = { 
           "n_estimators" : [63, 65],
           "max_depth" : [30, 40]}
 
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv= 2)
grid_search.fit(X_train, y_train)
grid_search.best_params_

{'max_depth': 40, 'n_estimators': 65}

In [40]:
print("Grid score :")
grid_search.grid_scores_

Grid score :




[mean: 0.75152, std: 0.00168, params: {'max_depth': 30, 'n_estimators': 63},
 mean: 0.75318, std: 0.00135, params: {'max_depth': 30, 'n_estimators': 65},
 mean: 0.75671, std: 0.00683, params: {'max_depth': 40, 'n_estimators': 63},
 mean: 0.75970, std: 0.00716, params: {'max_depth': 40, 'n_estimators': 65}]

In [41]:
"Best score", grid_search.best_score_

('Best score', 0.75969506131919129)

In [42]:
test = RandomForestClassifier(n_jobs=2, random_state=0, max_depth=30, n_estimators=63)
test.fit(X_train, y_train)
print("Validation Accuracy :", accuracy_score(y_val, test.predict(X_val)))

Validation Accuracy : 0.946968357787
