In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import os
import scipy
from sklearn.decomposition import PCA,TruncatedSVD
from sklearn.ensemble import *
from sklearn.preprocessing import MinMaxScaler, LabelBinarizer, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import *
import hyperopt
from hyperopt import *
from hyperopt import fmin, tpe, hp, space_eval
import matplotlib.pyplot as plt
%matplotlib inline 


<div class="alert alert-block alert-info">
<b>oading dataset:</b> We load our dataset here 
</div>

In [2]:
input_file_path = r'D:\kaggle_trials\stumbleupon\train.tsv'
df              = pd.read_csv(input_file_path,sep = '\t')
df.head(2)

Unnamed: 0,url,urlid,boilerplate,alchemy_category,alchemy_category_score,avglinksize,commonlinkratio_1,commonlinkratio_2,commonlinkratio_3,commonlinkratio_4,...,is_news,lengthyLinkDomain,linkwordscore,news_front_page,non_markup_alphanum_characters,numberOfLinks,numwords_in_url,parametrizedLinkRatio,spelling_errors_ratio,label
0,http://www.bloomberg.com/news/2010-12-23/ibm-p...,4042,"{""title"":""IBM Sees Holographic Calls Air Breat...",business,0.789131,2.055556,0.676471,0.205882,0.047059,0.023529,...,1,1,24,0,5424,170,8,0.152941,0.07913,0
1,http://www.popsci.com/technology/article/2012-...,8471,"{""title"":""The Fully Electronic Futuristic Star...",recreation,0.574147,3.677966,0.508021,0.28877,0.213904,0.144385,...,1,1,40,0,4973,187,9,0.181818,0.125448,1


<div class="alert alert-block alert-info">
<b>Preprocessing data :</b> We handle missing data, one hot encode categorical data and then finally scale numerical data.
</div>

In [3]:
cols_needed          = df.columns
df                   = df[cols_needed].replace('?', np.nan)
irrelevant_columns   = ['framebased','urlid','url','boilerplate']
cols_needed          = list(set(cols_needed)-set(irrelevant_columns))
cols_needed_features = list(cols_needed[:len(cols_needed)-1])
cols_needed_labels   = cols_needed[-1]
cols_encoding_needed = ['alchemy_category','hasDomainLink','is_news','lengthyLinkDomain','news_front_page']
cols_scaling_needed  = list(set(cols_needed)-set(cols_encoding_needed))
for i in range(len(cols_encoding_needed)):
    df[cols_encoding_needed[i]] = df[cols_encoding_needed[i]].fillna(df[cols_encoding_needed[i]].mode()[0])
mean_impute_dict    ={}
for i in range(len(cols_scaling_needed)):
    mean_impute_dict[cols_scaling_needed[i]] = np.nanmean(np.float_(df[cols_scaling_needed[i]].values))
for i in range(len(cols_scaling_needed)):
    df[cols_scaling_needed[i]]   = df[cols_scaling_needed[i]].fillna(mean_impute_dict[cols_scaling_needed[i]])
ohe                  = OneHotEncoder()
scalar               = MinMaxScaler()
encoded_matrix       = ohe.fit_transform(df[cols_encoding_needed])
scaled_matrix        = scalar.fit_transform(df[cols_scaling_needed])
X_complete_matrix    = scipy.sparse.hstack((encoded_matrix,scaled_matrix)).A
Y                    = df['label'].values

<div class="alert alert-block alert-info">
<b>Train Test split:</b> We perform train test split on the data
</div>

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X_complete_matrix, Y, test_size=0.33, random_state=42)

<div class="alert alert-block alert-info">
<b>Hyper parameter grid creation :</b> We perform parameter tuning by creating a grid of hyper parameters
</div>

In [5]:
bagging_grid = {'n_estimators' : hp.choice('n_estimators',range(5,20)),
                'max_features' : hp.uniform('max_features',0.1,0.95),
                'bootstrap'    : hp.choice('bootstrap',[True,False])
               }

In [6]:
def hyperopt_train_test(params):
    clf = BaggingClassifier(**params)
    return cross_val_score(clf, X_train, y_train).mean()

def function_to_minimise(params):
    accuracy = hyperopt_train_test(params)
    return {'loss': -1*accuracy, 'status': STATUS_OK}


trials          = Trials()
best            = fmin(function_to_minimise, bagging_grid, algo=tpe.suggest, max_evals=30, trials=trials)
best_parameters = space_eval(bagging_grid, best)
print('The best parameter tuned on training set is given by :- ',best_parameters)

100%|█████████████████████████████████████████████████████████████████| 30/30 [00:12<00:00,  2.82it/s, best loss: -1.0]
The best parameter tuned on training set is given by :-  {'bootstrap': False, 'max_features': 0.8755934908566991, 'n_estimators': 10}


<div class="alert alert-block alert-info">
<b>Model Fitting and analysis:</b> We fit the model using the tuned parameters and then present a classification report as analysis
</div>

In [7]:
model = BaggingClassifier(**best_parameters)
model.fit(X_train, y_train)

BaggingClassifier(base_estimator=None, bootstrap=False,
                  bootstrap_features=False, max_features=0.8755934908566991,
                  max_samples=1.0, n_estimators=10, n_jobs=None,
                  oob_score=False, random_state=None, verbose=0,
                  warm_start=False)

In [8]:
y_pred = model.predict(X_test)

In [9]:
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1198
           1       1.00      1.00      1.00      1243

    accuracy                           1.00      2441
   macro avg       1.00      1.00      1.00      2441
weighted avg       1.00      1.00      1.00      2441

