# Exercise 9

## Mashable news stories analysis

Predicting if a news story is going to be popular


In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV

import warnings
warnings.filterwarnings("ignore")

In [2]:
url = 'https://raw.githubusercontent.com/albahnsen/PracticalMachineLearningClass/master/datasets/mashable.csv'
df = pd.read_csv(url, index_col=0)
df.head()

Unnamed: 0,url,timedelta,n_tokens_title,n_tokens_content,n_unique_tokens,n_non_stop_words,n_non_stop_unique_tokens,num_hrefs,num_self_hrefs,num_imgs,...,min_positive_polarity,max_positive_polarity,avg_negative_polarity,min_negative_polarity,max_negative_polarity,title_subjectivity,title_sentiment_polarity,abs_title_subjectivity,abs_title_sentiment_polarity,Popular
0,http://mashable.com/2014/12/10/cia-torture-rep...,28.0,9.0,188.0,0.73262,1.0,0.844262,5.0,1.0,1.0,...,0.2,0.8,-0.4875,-0.6,-0.25,0.9,0.8,0.4,0.8,1
1,http://mashable.com/2013/10/18/bitlock-kicksta...,447.0,7.0,297.0,0.653199,1.0,0.815789,9.0,4.0,1.0,...,0.16,0.5,-0.13534,-0.4,-0.05,0.1,-0.1,0.4,0.1,0
2,http://mashable.com/2013/07/24/google-glass-po...,533.0,11.0,181.0,0.660377,1.0,0.775701,4.0,3.0,1.0,...,0.136364,1.0,0.0,0.0,0.0,0.3,1.0,0.2,1.0,0
3,http://mashable.com/2013/11/21/these-are-the-m...,413.0,12.0,781.0,0.497409,1.0,0.67735,10.0,3.0,1.0,...,0.1,1.0,-0.195701,-0.4,-0.071429,0.0,0.0,0.5,0.0,0
4,http://mashable.com/2014/02/11/parking-ticket-...,331.0,8.0,177.0,0.685714,1.0,0.830357,3.0,2.0,1.0,...,0.1,0.55,-0.175,-0.25,-0.1,0.0,0.0,0.5,0.0,0


In [3]:
df.shape

(6000, 61)

In [4]:
X = df.drop(['url', 'Popular'], axis=1)
y = df['Popular']
y = y.astype('category')

scaler = StandardScaler()

scaler.fit(X)
X = scaler.transform(X)

In [5]:
y_ = pd.DataFrame(y)
popular = pd.crosstab(index=y_['Popular'], columns='count')
popular/popular.sum()

col_0,count
Popular,Unnamed: 1_level_1
1,0.5
0,0.5


In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=7351)

In [7]:
X_train = pd.DataFrame(X_train)

# Exercise 9.1

Estimate a Decision Tree Classifier and a Logistic Regresion

Evaluate using the following metrics:
* Accuracy
* F1-Score

In [8]:
logreg = LogisticRegression()
tree = DecisionTreeClassifier()

logreg.fit(X_train,y_train)
tree.fit(X_train,y_train)

y_logreg_pred = logreg.predict(X_test)
y_tree_pred = tree.predict(X_test)

In [9]:
print(f"""LogReg
F1 Score: {round(f1_score(y_test,y_logreg_pred),3)}
Accuracy Score : {round(accuracy_score(y_test,y_logreg_pred),3)}""")
print(' ')
print(f"""Tree
F1 Score: {round(f1_score(y_test,y_tree_pred),3)}
Accuracy Score : {round(accuracy_score(y_test,y_tree_pred),3)}""")

LogReg
F1 Score: 0.631
Accuracy Score : 0.641
 
Tree
F1 Score: 0.56
Accuracy Score : 0.554


# Exercise 9.2

Estimate 300 bagged samples

Estimate the following set of classifiers:

* 100 Decision Trees where max_depth=None
* 100 Decision Trees where max_depth=2
* 100 Logistic Regressions

In [10]:
modelos = {
    'DTree_None' : DecisionTreeClassifier(max_depth=None),
    'DTree_2' : DecisionTreeClassifier(max_depth=2),
    'LogReg' : LogisticRegression()
}
y_pred = pd.DataFrame(columns = modelos.keys(), index = y_test.index)
for model in modelos.keys():
    breg = BaggingClassifier(modelos[model], n_estimators=100, n_jobs=-1,
                            bootstrap=True, oob_score=True, random_state=7351)
    breg.fit(X_train, y_train)
    y_pred[model] = breg.predict(X_test)

In [11]:
y_pred.head()

Unnamed: 0,DTree_None,DTree_2,LogReg
4078,0,0,0
340,0,1,1
1984,0,1,1
3819,1,0,0
3983,0,1,0


# Exercise 9.3

Ensemble using majority voting

Evaluate using the following metrics:
* Accuracy
* F1-Score

In [12]:
for model in modelos.keys():
    print(f'''
    {model}
    F1 Score: {round(f1_score(y_test,y_pred[model]),3)}
    Accuracy Score : {round(accuracy_score(y_test,y_pred[model]),3)}
    ''')

print(f'''Para el promedio de los tres:
F1 Score: {round(metrics.f1_score(y_pred.mean(axis=1).astype(int), y_test),3)}
Accuracy Score : {round(metrics.accuracy_score(y_pred.mean(axis=1).astype(int), y_test),3)}
''')


    DTree_None
    F1 Score: 0.648
    Accuracy Score : 0.65
    

    DTree_2
    F1 Score: 0.633
    Accuracy Score : 0.635
    

    LogReg
    F1 Score: 0.633
    Accuracy Score : 0.643
    
Para el promedio de los tres:
F1 Score: 0.534
Accuracy Score : 0.617



In [None]:
# Usando VotingClassifier de SKLearn

In [13]:
vclass = VotingClassifier(estimators=modelos.items(),
                         n_jobs=-1)
vclass.fit(X_train,y_train)

VotingClassifier(estimators=dict_items([('DTree_None', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_w...y='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
         flatten_transform=None, n_jobs=-1, voting='hard', weights=None)

In [14]:
print(f"""Con el algoritmo de SKLearn
F1 Score: {round(f1_score(y_test,vclass.predict(X_test)),3)}
Accuracy Score : {round(accuracy_score(y_test,vclass.predict(X_test)),3)}""")

Con el algoritmo de SKLearn
F1 Score: 0.635
Accuracy Score : 0.638


# Exercise 9.4

Estimate te probability as %models that predict positive

Modify the probability threshold and select the one that maximizes the F1-Score

In [15]:
y_ = pd.DataFrame(y)
popular = pd.crosstab(index=y_['Popular'], columns='count')
popular/popular.sum()

col_0,count
Popular,Unnamed: 1_level_1
1,0.5
0,0.5


In [16]:
f1 = []
y_pred = pd.DataFrame(index=y_test.index, columns=modelos.keys())
for k in np.arange(0.09,1,0.1):
    for model in modelos.keys():
        breg = BaggingClassifier(modelos[model], n_estimators=100, n_jobs=-1,
                                bootstrap=True,oob_score=True, random_state=7351)
        breg.fit(X_train,y_train)
        y_pred[model] = (breg.predict_proba(X_test)[:,1] >= k).astype(bool)
        f1.append([model,k,round(f1_score(y_test,y_pred[model]),3),round(metrics.accuracy_score(y_pred[model],y_test))])
f1 = pd.DataFrame(f1, columns=['Modelo','Prob','F1_Score','Accuracy_Score'])      

In [17]:
print(f'''
Mejor F1 Score DTree_None:
{f1.loc[f1[f1.Modelo=='DTree_None']['F1_Score'].idxmax()]}

Mejor F1 Score DTree_2:
{f1.loc[f1[f1.Modelo=='DTree_2']['F1_Score'].idxmax()]}

Mejor F1 Score LogReg:
{f1.loc[f1[f1.Modelo=='LogReg']['F1_Score'].idxmax()]}
''')


Mejor F1 Score DTree_None:
Modelo            DTree_None
Prob                    0.29
F1_Score               0.714
Accuracy_Score             1
Name: 6, dtype: object

Mejor F1 Score DTree_2:
Modelo            DTree_2
Prob                 0.29
F1_Score            0.699
Accuracy_Score          1
Name: 7, dtype: object

Mejor F1 Score LogReg:
Modelo            LogReg
Prob                0.39
F1_Score           0.708
Accuracy_Score         1
Name: 11, dtype: object



In [None]:
# Usando VotingClassifier de SKLearn

In [18]:
f1 = []
vclass = VotingClassifier(estimators=modelos.items(),
                          voting='soft',
                         n_jobs=-1)
vclass.fit(X_train,y_train)
for k in np.arange(0.09,1,0.1):
    y_pred = (vclass.predict_proba(X_test)[:,1] >= k).astype(bool)
    f1.append([k,round(f1_score(y_test,y_pred),3),round(vclass.score(X_test,y_test),3)])
f1 = pd.DataFrame(f1, columns=['Prob','F1_Score','Accuracy_Score'])
f1.loc[f1['F1_Score'].idxmax()]

Prob              0.190
F1_Score          0.695
Accuracy_Score    0.568
Name: 1, dtype: float64

# Exercise 9.5

Ensemble using weighted voting using the oob_error

Evaluate using the following metrics:
* Accuracy
* F1-Score

In [19]:
modelos_2 = {}
f1 = []
y_pred = pd.DataFrame(index=y_test.index, columns=modelos.keys())
for model in modelos.keys():
    modelos_2[model] = BaggingClassifier(modelos[model], n_estimators=100,n_jobs=-1,bootstrap=True, oob_score=True, random_state=7351)
    modelos_2[model].fit(X_train, y_train)
    y_pred[model] = modelos_2[model].predict(X_test)
    f1.append([model, metrics.f1_score(y_pred[model], y_test), metrics.accuracy_score(y_pred[model], y_test)])
f1=pd.DataFrame(f1, columns=['Modelo', 'F1_Score', 'Accuracy_Score',])

In [20]:
f1_=[]
y_pred = pd.DataFrame(index=y_test.index, columns=modelos_2.keys())
for model in modelos_2.keys():
    errors = np.zeros(modelos_2[model].n_estimators)
    y_pred_all_ = np.zeros((X_test.shape[0], modelos_2[model].n_estimators))
    for i in range(modelos_2[model].n_estimators):
        oob_sample = ~modelos_2[model].estimators_samples_[i]
        y_pred_ = modelos_2[model].estimators_[i].predict(X_train.values[oob_sample])
        errors[i] = metrics.accuracy_score(y_pred_, y_train.values[oob_sample])
        y_pred_all_[:, i] = modelos_2[model].estimators_[i].predict(X_test)
    alpha = (1 - errors) / (1 - errors).sum()
    y_pred[model] = (np.sum(y_pred_all_ * alpha, axis=1) >= 0.5).astype(np.int)
    f1_.append([model, metrics.f1_score(y_pred[model], y_test), metrics.accuracy_score(y_pred[model], y_test)])
f1_=pd.DataFrame(f1_, columns=['Modelo', 'F1_Score', 'Accuracy_Score',])

In [21]:
print(f'''Best F1 Score:
{f1_.loc[f1_['F1_Score'].idxmax()]}''')

Best F1 Score:
Modelo            DTree_None
F1_Score            0.650699
Accuracy_Score          0.65
Name: 0, dtype: object


In [None]:
# Usando VotingClassifier de SKLearn

In [22]:
y_pred_mean = []
for i in modelos_2.keys():
    errors = np.zeros(modelos_2[i].n_estimators)
    y_pred_all_ = np.zeros((X_test.shape[0], modelos_2[i].n_estimators))
    for l in range(modelos_2[i].n_estimators):
        oob_sample = ~modelos_2[i].estimators_samples_[l]
        y_pred_ = modelos_2[i].estimators_[l].predict(pd.DataFrame(X_train).values[oob_sample])
        errors[l] = metrics.accuracy_score(y_pred_, y_train.values[oob_sample])
        y_pred_all_[:, l] = modelos_2[i].estimators_[l].predict(X_test)
    alpha = (1 - errors) / (1 - errors).sum()
    y_pred = (np.sum(y_pred_all_ * alpha, axis=1) >= 0.5).astype(np.int)
    y_pred_mean.append([i, y_pred.mean()])

In [23]:
y_pred_mean = pd.DataFrame(y_pred_mean)
y_pred_mean[1].tolist()

[0.4806666666666667, 0.472, 0.45266666666666666]

In [24]:
f1 = []
vclass = VotingClassifier(estimators=modelos_2.items(),
                          voting='soft',
                         n_jobs=-1,
                          weights=y_pred_mean[1].tolist())
vclass.fit(X_train,y_train)
y_pred = (vclass.predict_proba(X_test)[:,1] >= 0.5).astype(bool)
f1.append([round(f1_score(y_test,y_pred),3),round(vclass.score(X_test,y_test),3)])
f1 = pd.DataFrame(f1, columns=['F1_Score','Accuracy_Score'])
f1.loc[f1['F1_Score'].idxmax()]

F1_Score          0.664
Accuracy_Score    0.665
Name: 0, dtype: float64

# Exercise 9.6

Estimate te probability of the weighted voting

Modify the probability threshold and select the one that maximizes the F1-Score

In [25]:
f1_=[]
y_pred = pd.DataFrame(index=y_test.index, columns=modelos_2.keys())
for k in np.arange(0.09,1,0.1):
    for model in modelos_2.keys():
        errors = np.zeros(modelos_2[model].n_estimators)
        y_pred_all_ = np.zeros((X_test.shape[0], modelos_2[model].n_estimators))
        for i in range(modelos_2[model].n_estimators):
            oob_sample = ~modelos_2[model].estimators_samples_[i]
            y_pred_ = modelos_2[model].estimators_[i].predict(X_train.values[oob_sample])
            errors[i] = metrics.accuracy_score(y_pred_, y_train.values[oob_sample])
            y_pred_all_[:, i] = modelos_2[model].estimators_[i].predict(X_test)
        alpha = (1 - errors) / (1 - errors).sum()
        y_pred[model] = (np.sum(y_pred_all_ * alpha, axis=1) >= k).astype(np.int)
        f1_.append([model,k, metrics.f1_score(y_pred[model], y_test), metrics.accuracy_score(y_pred[model], y_test)])
f1_=pd.DataFrame(f1_, columns=['Modelo','Prob', 'F1_Score', 'Accuracy_Score',])

In [26]:
print(f'''Best F1 Score:
{f1_.loc[f1_['F1_Score'].idxmax()]}''')

Best F1 Score:
Modelo            DTree_None
Prob                    0.29
F1_Score            0.712823
Accuracy_Score      0.607333
Name: 6, dtype: object


In [None]:
# Usando VotingClassifier de SKLearn

In [27]:
f1 = []
vclass = VotingClassifier(estimators=modelos_2.items(),
                          voting='soft',
                         n_jobs=-1,
                          weights=y_pred_mean[1].tolist())
vclass.fit(X_train,y_train)
for k in np.arange(0.09,1,0.1):
    y_pred = (vclass.predict_proba(X_test)[:,1] >= k).astype(bool)
    f1.append([k, round(f1_score(y_test,y_pred),3),round(vclass.score(X_test,y_test),3)])
f1 = pd.DataFrame(f1, columns=['Prob','F1_Score','Accuracy_Score'])
f1.loc[f1['F1_Score'].idxmax()]

Prob              0.390
F1_Score          0.714
Accuracy_Score    0.665
Name: 3, dtype: float64

# Exercise 9.7

Estimate a logistic regression using as input the estimated classifiers

Modify the probability threshold such that maximizes the F1-Score

In [28]:
f1=[]
y_pred = pd.DataFrame(index=y_test.index, columns=modelos_2.keys())
for model in modelos_2.keys():
    X_test_ = np.zeros((X_test.shape[0], modelos_2[model].n_estimators))
    X_train_ = np.zeros((X_train.shape[0], modelos_2[model].n_estimators))
    for i in range(modelos_2[model].n_estimators):
        X_train_[:, i] = modelos_2[model].estimators_[i].predict(X_train)
        X_test_[:, i] = modelos_2[model].estimators_[i].predict(X_test)
    logregcv = LogisticRegressionCV(cv=5)
    logregcv.fit(X_train_, y_train)
    y_pred[model] = logregcv.predict(X_test_)
    f1.append([model, metrics.f1_score(y_pred[model], y_test), metrics.accuracy_score(y_pred[model], y_test)])
f1=pd.DataFrame(f1, columns=['Model','F1_Score','Accuracy_Score'])

In [29]:
print(f'''Best F1 Score:
{f1.loc[f1['F1_Score'].idxmax()]}''')

Best F1 Score:
Model             DTree_None
F1_Score            0.647887
Accuracy_Score          0.65
Name: 0, dtype: object


In [None]:
# Usando VotingClassifier de SKLearn

In [34]:
X_train_2 = (vclass.predict_proba(X_train)[:,1] >= 0.390).astype(bool)
X_test_2 = (vclass.predict_proba(X_test)[:,1] >= 0.390).astype(bool)

In [37]:
logregcv.fit(X_train_2.reshape(-1,1),y_train)
print(f'''
F1 Score: {round(f1_score(logregcv.predict(X_test_2.reshape(-1,1)),y_test),3)}
Accuracy Score: {round(metrics.accuracy_score(logregcv.predict(X_test_2.reshape(-1,1)),y_test),3)}
''')


F1 Score: 0.714
Accuracy Score: 0.65

