# 04 - Applied ML

# Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import networkx as nx

from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score

from sklearn.model_selection import ShuffleSplit

from sklearn.linear_model import LogisticRegression
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn import metrics

%matplotlib inline

# Constants definition

In [None]:
# Directory holding the input data; used below both for the lalonde CSV and
# as the scikit-learn data_home for the 20 newsgroups download.
DATA_PATH = "./data"

 # Question 1: Propensity score matching

In [None]:
# Load the lalonde data set and index the rows by the 'id' column.
lalonde_csv = "{}/lalonde.csv".format(DATA_PATH)
df = pd.read_csv(lalonde_csv).set_index('id')

In [None]:
# Binary outcome: 1 when re78 is strictly greater than re75, 0 otherwise.
earnings_change = df['re78'] - df['re75']
df['outcome'] = (earnings_change > 0).astype(int)

In [None]:
# Preview the first rows of the data frame.
df.head()

In [None]:
# Number of rows for every (educ, outcome) combination, counted on 'treat'.
df.groupby(['educ', 'outcome'])[['treat']].count()

### 1. A naive approach

In [None]:
# Compare the re74 and re78 earnings distributions for the rows with
# treat == 0.
not_treated_rows = df["treat"] == 0
x = df.loc[not_treated_rows, 're74']
y = df.loc[not_treated_rows, 're78']

plt.hist([x, y], label=['Before treatment', 'After treatment'])
plt.title("People NOT under treatment")
plt.legend(loc='upper right')
plt.show()

In [None]:
# Compare the re74 and re78 earnings distributions for the rows with
# treat == 1.
treated_rows = df["treat"] == 1
x = df.loc[treated_rows, 're74']
y = df.loc[treated_rows, 're78']

plt.hist([x, y], label=['Before treatment', 'After treatment'])
plt.title("People under treatment")
plt.legend(loc='upper right')
plt.show()

### 3. A propensity score model

The propensity score is the probability of being under treatment given the observed covariates. Here we use a logistic regression classifier to build a model for this task.

In [None]:
# Logistic regression with scikit-learn's default settings; it is fitted
# further down to model treatment assignment.
logistic = LogisticRegression()

The different covariates of this problem are the following:

In [None]:
# Columns used as covariates for the propensity model.
# NOTE(review): the earnings columns re74 / re75 that the notebook uses
# elsewhere are not part of this list - confirm that is intentional.
features = ['age', 'educ', 'black', 'hispan', 'married', 'nodegree']

In [None]:
# Design matrix: the selected covariate columns, passed through
# get_dummies before being handed to the classifier.
covariates = df[features]
X = pd.get_dummies(covariates)
X.head()

We want to compute the probability of treatment given the covariates. Our target is therefore the treatment indicator.

In [None]:
# Target vector for the propensity model: the 'treat' column.
y = df['treat']

In [None]:
# Fit the propensity model: treatment (y) as a function of the covariates (X).
logistic.fit(X, y)

In [None]:
# Show the class labels the model learned; their order is the column order
# of the predict_proba output used in the next cell.
logistic.classes_

In [None]:
# Per-class probabilities for every row of X, one column per class.
pred = logistic.predict_proba(X)

# Keep the second column as the propensity score.
# NOTE(review): this assumes logistic.classes_[1] is the "treated" label.
treated_column = 1
df['propensity score'] = pred[:, treated_column]

df.head()

### 4. Balancing the dataset via matching

In [None]:
# Small hand-written bipartite graph; the "bipartite" node attribute records
# which side each node belongs to.
left_nodes = [1, 2, 3, 4]
right_nodes = ['a', 'b', 'c']
toy_edges = [
    (1, 'a'),
    (1, 'b'),
    (2, 'b'),
    (2, 'c'),
    (3, 'c'),
    (4, 'a'),
]

G = nx.Graph()
G.add_nodes_from(left_nodes, bipartite=0)
G.add_nodes_from(right_nodes, bipartite=1)
G.add_edges_from(toy_edges)

In [None]:
# Split the propensity scores by treatment status.
scores = df['propensity score']
treated = scores[df['treat'] == 1]
control = scores[df['treat'] == 0]

In [None]:
B = nx.Graph()

# Work on a small sample of each group while prototyping the matching.
treated_sample = treated[:5]
control_sample = control[:5]

# Key every node by its index label (assumed unique per person) and keep the
# propensity score as a node attribute. Using the float score itself as the
# node would merge any two people who share a score into a single node.
B.add_nodes_from(
    ((label, {'score': score}) for label, score in treated_sample.items()),
    bipartite=0,
)
B.add_nodes_from(
    ((label, {'score': score}) for label, score in control_sample.items()),
    bipartite=1,
)

# Connect every treated subject to every control subject. Without edges there
# is nothing to match and the graph is disconnected. The weight is the
# absolute propensity-score difference, i.e. the cost of pairing the two.
B.add_edges_from(
    (t_label, c_label, {'weight': abs(t_score - c_score)})
    for t_label, t_score in treated_sample.items()
    for c_label, c_score in control_sample.items()
)

In [None]:
# Draw the graph, with node labels, in the left half of a 1x2 subplot grid.
left_axes = plt.subplot(121)
nx.draw(B, ax=left_axes, with_labels=True, font_weight='regular')

In [None]:
from networkx.algorithms import bipartite

# When the graph is not connected the two sides cannot be inferred from the
# edges alone and bipartite.sets raises AmbiguousSolution. Pass the side that
# was recorded in each node's "bipartite" attribute instead.
top_nodes = {
    node for node, attrs in B.nodes(data=True) if attrs.get('bipartite') == 0
}
a, b = bipartite.sets(B, top_nodes)

In [None]:
# Display the first of the two node sets returned by bipartite.sets.
a

In [None]:
# maximum_matching has to know the two sides of the graph. On a disconnected
# graph it cannot infer them and raises AmbiguousSolution, so pass the nodes
# whose "bipartite" attribute is 0 explicitly.
# Note: this maximizes the number of matched pairs only; it does not take
# edge attributes into account.
matching_top_nodes = {
    node for node, attrs in B.nodes(data=True) if attrs.get('bipartite') == 0
}
nx.bipartite.maximum_matching(B, top_nodes=matching_top_nodes)

 # Question 2: Applied ML

### 1)

In [None]:
# Fetch the complete 20 newsgroups corpus, using DATA_PATH as the
# scikit-learn data_home directory.
newsgroups_home = '{}/'.format(DATA_PATH)
newsgroups = fetch_20newsgroups(data_home=newsgroups_home, subset='all')

In [None]:
# Print one raw document (index 1), followed by a blank line, to see what the
# text looks like.
print(newsgroups.data[1], end="\n\n")

In [None]:
# Check what kind of container fetch_20newsgroups returned.
type(newsgroups)

In [None]:
# Turn every document of the corpus into a TF-IDF vector and show the shape
# of the resulting matrix.
corpus = newsgroups.data
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(corpus)
vectors.shape

### 2)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest settings: 100 trees, a fixed seed, and n_jobs=-1.
forest_params = {
    'n_estimators': 100,
    'random_state': 0,
    'n_jobs': -1,
}
clf = RandomForestClassifier(**forest_params)

In [None]:
# Fetch the predefined train and test subsets. The original cell used
# `newsgroups_train` without ever defining it and built the "test" vectors
# from training documents.
newsgroups_home = '{}/'.format(DATA_PATH)
newsgroups_train = fetch_20newsgroups(data_home=newsgroups_home, subset='train')
newsgroups_test = fetch_20newsgroups(data_home=newsgroups_home, subset='test')

# Vectorize each subset with the already fitted vectorizer so that the rows
# of the feature matrices line up with the corresponding targets.
# NOTE(review): if `vectorizer` was fitted on a corpus that contains the test
# documents, its IDF weights have already seen the test set - confirm that
# this is acceptable for the evaluation.
vectors_train = vectorizer.transform(newsgroups_train.data)
vectors_test = vectorizer.transform(newsgroups_test.data)

# Train on the training documents only, then predict the held-out ones.
clf.fit(vectors_train, newsgroups_train.target)
pred = clf.predict(vectors_test)

# Macro-averaged F1 on the test subset.
metrics.f1_score(newsgroups_test.target, pred, average='macro')