* Dataset: https://www.kaggle.com/amananandrai/clickbait-dataset

### Import libraries

In [1]:
import pandas as pd
import numpy as np
import string
import re
import nltk

import matplotlib.pyplot as plt

In [2]:
nltk.download("stopwords")
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Sohal\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Sohal\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Reading the dataset

In [3]:
click_data_actual = pd.read_csv("Data/clickbait_data.csv")

In [4]:
click_data_actual.head()

Unnamed: 0,headline,clickbait
0,Should I Get Bings,1
1,Which TV Female Friend Group Do You Belong In,1
2,"The New ""Star Wars: The Force Awakens"" Trailer...",1
3,"This Vine Of New York On ""Celebrity Big Brothe...",1
4,A Couple Did A Stunning Photo Shoot With Their...,1


In [5]:
click_data_actual.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32000 entries, 0 to 31999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   headline   32000 non-null  object
 1   clickbait  32000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 500.1+ KB


In [6]:
data_clickbait = click_data_actual[click_data_actual['clickbait'] == 1].head(8000)
data_no_clickbait = click_data_actual[click_data_actual['clickbait'] == 0].head(8000)

In [7]:
click_data = pd.concat([data_clickbait, data_no_clickbait])
click_data = click_data.reset_index(drop=True)
click_data.tail()

Unnamed: 0,headline,clickbait
15995,"A Good Run for Mutual Funds, but Questions Remain",0
15996,"Circus elephant escapes in Zurich, Switzerland",0
15997,"Bush backtracks over legal status of alleged ""...",0
15998,Synchronised bombings strike Indian state of A...,0
15999,U.S. Agencies Advised to Release Information t...,0


### Exploring the dataset

In [8]:
# Shape of the data
click_data.shape

(16000, 2)

In [9]:
# How many records are clickbait and how many are not?
click_data['clickbait'].value_counts()

1    8000
0    8000
Name: clickbait, dtype: int64

In [10]:
# Find missing data
click_data.isnull().sum()

headline     0
clickbait    0
dtype: int64

### Removing punctuation

In [11]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [12]:
def remove_punctuation(text):
    text_no_punctuation = "".join([char for char in text if char not in string.punctuation])
    return text_no_punctuation

In [13]:
click_data['headline_no_punctuation'] = click_data['headline'].apply(lambda x: remove_punctuation(x))

#click_data.head(10)

### Tokenization

In [14]:
# The function tokenizes the text by sliptting the text at places where it finds a non word/ non numeric character
def tokenize(text):
    tokens = re.split('\W+', text)
    return tokens

In [15]:
click_data['headline_tokenized'] = click_data['headline_no_punctuation'].apply(lambda x: tokenize(x.lower()))

#click_data.head(10)

### Removing stopwords

In [16]:
stopwords = nltk.corpus.stopwords.words("english")

In [17]:
len(stopwords)

179

In [18]:
def remove_stopwords(tokenized_lst):
    text = [word for word in tokenized_lst if word not in stopwords]
    return text

In [19]:
click_data["headline_no_stopwords"] = click_data["headline_tokenized"].apply(lambda x: remove_stopwords(x))

#click_data.head()

### Stem text

In [20]:
# Initialize PorterStemmer
# ps = nltk.PorterStemmer()

In [21]:
# def stemming(tokenized_text):
#    stem_text = " ".join(ps.stem(word) for word in tokenized_text)
#    return stem_text

In [22]:
#click_data['headline_stemmed'] = click_data['headline_no_stopwords'].apply(lambda x: stemming(x))

#click_data.head()

### Lemmatize text

In [23]:
# Initialize lemmatizer
wn = nltk.WordNetLemmatizer()

In [24]:
def lemmatizing(tokenized_text):
    lemmatized_text = " ".join(wn.lemmatize(word) for word in tokenized_text)
    return lemmatized_text

In [25]:
click_data['headline_lemmatized'] = click_data['headline_no_stopwords'].apply(lambda x: lemmatizing(x))

#click_data.head()

In [26]:
click_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16000 entries, 0 to 15999
Data columns (total 6 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   headline                 16000 non-null  object
 1   clickbait                16000 non-null  int64 
 2   headline_no_punctuation  16000 non-null  object
 3   headline_tokenized       16000 non-null  object
 4   headline_no_stopwords    16000 non-null  object
 5   headline_lemmatized      16000 non-null  object
dtypes: int64(1), object(5)
memory usage: 750.1+ KB


***

## <center>Vectorization</center>

### Count Vectorization

In [27]:
from sklearn.feature_extraction.text import CountVectorizer

In [28]:
count_vect = CountVectorizer()

In [29]:
data_1 = click_data.copy()
data_1.head()

Unnamed: 0,headline,clickbait,headline_no_punctuation,headline_tokenized,headline_no_stopwords,headline_lemmatized
0,Should I Get Bings,1,Should I Get Bings,"[should, i, get, bings]","[get, bings]",get bings
1,Which TV Female Friend Group Do You Belong In,1,Which TV Female Friend Group Do You Belong In,"[which, tv, female, friend, group, do, you, be...","[tv, female, friend, group, belong]",tv female friend group belong
2,"The New ""Star Wars: The Force Awakens"" Trailer...",1,The New Star Wars The Force Awakens Trailer Is...,"[the, new, star, wars, the, force, awakens, tr...","[new, star, wars, force, awakens, trailer, giv...",new star war force awakens trailer give chill
3,"This Vine Of New York On ""Celebrity Big Brothe...",1,This Vine Of New York On Celebrity Big Brother...,"[this, vine, of, new, york, on, celebrity, big...","[vine, new, york, celebrity, big, brother, fuc...",vine new york celebrity big brother fucking pe...
4,A Couple Did A Stunning Photo Shoot With Their...,1,A Couple Did A Stunning Photo Shoot With Their...,"[a, couple, did, a, stunning, photo, shoot, wi...","[couple, stunning, photo, shoot, baby, learnin...",couple stunning photo shoot baby learning inop...


In [30]:
x_count = count_vect.fit_transform(data_1['headline_lemmatized'])

In [31]:
x_count.shape

(16000, 15389)

In [32]:
count_vect.get_feature_names()



['000',
 '00s',
 '01',
 '02',
 '04',
 '08',
 '09',
 '10',
 '100',
 '1000',
 '10000',
 '100000',
 '10000copy',
 '10000th',
 '100k',
 '100th',
 '100yearold',
 '100ºf',
 '101',
 '103',
 '104',
 '105',
 '106',
 '10600',
 '108',
 '109',
 '109th',
 '10hour',
 '10minute',
 '10second',
 '10th',
 '10year',
 '10yearold',
 '11',
 '1100',
 '111',
 '113',
 '115',
 '117',
 '119',
 '11k',
 '11th',
 '11yearolds',
 '12',
 '1200',
 '12000',
 '120000',
 '1200mile',
 '121',
 '1215',
 '126292pound',
 '12th',
 '12yearold',
 '13',
 '130',
 '13000',
 '133',
 '134',
 '13500',
 '136',
 '137',
 '139',
 '139b',
 '13yearold',
 '14',
 '140',
 '14000',
 '140000',
 '14000acre',
 '1419',
 '144',
 '146',
 '147',
 '149',
 '14yearold',
 '15',
 '150',
 '15000',
 '153',
 '153000',
 '154',
 '159',
 '15m',
 '15second',
 '15yearold',
 '16',
 '160',
 '163',
 '164',
 '165',
 '168',
 '16death',
 '16hour',
 '16th',
 '16year',
 '16yearold',
 '17',
 '1700',
 '172',
 '175',
 '17point',
 '17th',
 '17year',
 '17yearold',
 '18',
 '180'

In [33]:
x_count_df = pd.DataFrame(x_count.toarray())
x_count_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15379,15380,15381,15382,15383,15384,15385,15386,15387,15388
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [34]:
x_count_df.columns = count_vect.get_feature_names()



In [35]:
x_count_df.head()

Unnamed: 0,000,00s,01,02,04,08,09,10,100,1000,...,zoom,zoombak,zotob,zuckerberg,zuma,zurich,zykina,ángel,íngrid,ürümqi
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Count Vectorizer (N-gram)

In [36]:
data_2 = click_data.copy()
data_2.head()

Unnamed: 0,headline,clickbait,headline_no_punctuation,headline_tokenized,headline_no_stopwords,headline_lemmatized
0,Should I Get Bings,1,Should I Get Bings,"[should, i, get, bings]","[get, bings]",get bings
1,Which TV Female Friend Group Do You Belong In,1,Which TV Female Friend Group Do You Belong In,"[which, tv, female, friend, group, do, you, be...","[tv, female, friend, group, belong]",tv female friend group belong
2,"The New ""Star Wars: The Force Awakens"" Trailer...",1,The New Star Wars The Force Awakens Trailer Is...,"[the, new, star, wars, the, force, awakens, tr...","[new, star, wars, force, awakens, trailer, giv...",new star war force awakens trailer give chill
3,"This Vine Of New York On ""Celebrity Big Brothe...",1,This Vine Of New York On Celebrity Big Brother...,"[this, vine, of, new, york, on, celebrity, big...","[vine, new, york, celebrity, big, brother, fuc...",vine new york celebrity big brother fucking pe...
4,A Couple Did A Stunning Photo Shoot With Their...,1,A Couple Did A Stunning Photo Shoot With Their...,"[a, couple, did, a, stunning, photo, shoot, wi...","[couple, stunning, photo, shoot, baby, learnin...",couple stunning photo shoot baby learning inop...


In [37]:
# range is unigram, bigram, trigram and so on
ngram_vect = CountVectorizer(ngram_range=(2,2))

In [38]:
x_count_ngram = ngram_vect.fit_transform(data_2['headline_lemmatized'])

In [39]:
x_count_ngram.shape

(16000, 67687)

In [40]:
ngram_vect.get_feature_names()



['000 run',
 '00s bad',
 '00s british',
 '00s disney',
 '00s everyone',
 '00s forgot',
 '00s girl',
 '00s hit',
 '00s kid',
 '00s movie',
 '00s music',
 '00s pop',
 '00s still',
 '00s teen',
 '00s youll',
 '01 percent',
 '02 last',
 '04 last',
 '08 matchday',
 '09 forssell',
 '09 goalkeeper',
 '09 inter',
 '09 wolfsburg',
 '10 15',
 '10 2008',
 '10 amazing',
 '10 arrested',
 '10 authority',
 '10 awesome',
 '10 awkward',
 '10 basic',
 '10 billionth',
 '10 bold',
 '10 celebs',
 '10 centimetre',
 '10 cheating',
 '10 christmas',
 '10 constellation',
 '10 day',
 '10 delicious',
 '10 excuse',
 '10 food',
 '10 friend',
 '10 ghost',
 '10 give',
 '10 gogo',
 '10 google',
 '10 hairstyle',
 '10 injured',
 '10 jackolanterns',
 '10 le',
 '10 leongatha',
 '10 lifechanging',
 '10 men',
 '10 mesmerizing',
 '10 million',
 '10 millionth',
 '10 minute',
 '10 month',
 '10 mouthwatering',
 '10 movie',
 '10 much',
 '10 mundane',
 '10 pair',
 '10 people',
 '10 perk',
 '10 popular',
 '10 question',
 '10 rare'

In [41]:
x_count_ngram_df = pd.DataFrame(x_count_ngram.toarray(), columns = ngram_vect.get_feature_names())
x_count_ngram_df.head()

Unnamed: 0,000 run,00s bad,00s british,00s disney,00s everyone,00s forgot,00s girl,00s hit,00s kid,00s movie,...,zuckerberg answered,zuckerberg victim,zuma charged,zuma chooses,zurich call,zurich switzerland,zykina dy,ángel cabrera,íngrid betancourt,ürümqi china
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Tfidf vectorizer

In [42]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [43]:
data_3 = click_data.copy()

In [44]:
tfidf__vect = TfidfVectorizer()
x_tfidf = tfidf__vect.fit_transform(data_3['headline_lemmatized'])

In [45]:
x_tfidf.shape

(16000, 15389)

In [46]:
x_tfidf_df = pd.DataFrame(x_tfidf.toarray(), columns = tfidf__vect.get_feature_names())
x_tfidf_df = x_tfidf_df.reset_index(drop=True)
x_tfidf_df.isnull().sum()



000       0
00s       0
01        0
02        0
04        0
         ..
zurich    0
zykina    0
ángel     0
íngrid    0
ürümqi    0
Length: 15389, dtype: int64

***

## <center>Feature Engineering</center>

In [47]:
#data_4 = click_data.iloc[:, :2]
#data_4.head()

### Counting the length of the text

In [48]:
# This feature is used to check if the length of clickbait headline is longer than that of normal headline
#data_4['headline_length'] = data_4['headline'].apply(lambda x: len(x) - x.count(" "))
#data_4.head()
x_tfidf_df['headline_length'] = click_data['headline'].apply(lambda x: len(x) - x.count(" "))
x_tfidf_df = x_tfidf_df.reset_index(drop=True)
x_tfidf_df.isnull().sum()

000                0
00s                0
01                 0
02                 0
04                 0
                  ..
zykina             0
ángel              0
íngrid             0
ürümqi             0
headline_length    0
Length: 15390, dtype: int64

### Calcultaing the percentage of the punctuation in the headline

In [49]:
def count_punctuation(text):
    count = sum(1 for char in text if char in string.punctuation)
    percent = np.round(count / (len(text) - text.count(" ")),2) * 100
    return percent

#data_4['punctuation%'] = data_4['headline'].apply(lambda x: count_punctuation(x))
#data_4.head()
x_tfidf_df['punctuation%'] = click_data['headline'].apply(lambda x: count_punctuation(x))
x_tfidf_df = x_tfidf_df.reset_index(drop=True)
x_tfidf_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16000 entries, 0 to 15999
Columns: 15391 entries, 000 to punctuation%
dtypes: float64(15390), int64(1)
memory usage: 1.8 GB


### Counting the percent of uppercase letters in the headline

In [50]:
def count_uppercase_char(text):
    count_uppercase = sum(1 for char in text if char.isupper())
    percent_uppercase = np.round(count_uppercase / (len(text) - text.count(" ")),2) * 100
    return percent_uppercase

#data_4['Uppercase%'] = data_4['headline'].apply(lambda x: count_uppercase_char(x))
#data_4.head()
x_tfidf_df['uppercase%'] = click_data['headline'].apply(lambda x: count_uppercase_char(x))
x_tfidf_df.isnull().sum()

000                0
00s                0
01                 0
02                 0
04                 0
                  ..
íngrid             0
ürümqi             0
headline_length    0
punctuation%       0
uppercase%         0
Length: 15392, dtype: int64

### Count the number of question marks

In [51]:
def count_question_mark(text):
    count = sum(1 for char in text if char == "?")
    return count

#data_4['Question_mark_count'] = data_4['headline'].apply(lambda x: count_question_mark(x))
#data_4.iloc[14023]
x_tfidf_df['question_mark_count'] = click_data['headline'].apply(lambda x: count_question_mark(x))
x_tfidf_df.iloc[13758]

000                     0.0
00s                     0.0
01                      0.0
02                      0.0
04                      0.0
                       ... 
ürümqi                  0.0
headline_length        46.0
punctuation%            0.0
uppercase%              4.0
question_mark_count     0.0
Name: 13758, Length: 15393, dtype: float64

### Evaluating the features

In [52]:
# Plotting the headline length
#bins = np.linspace(0, 120, 40)
#plt.hist(data_4[data_4['clickbait']==1]['headline_length'], bins, alpha = 0.5, label = 'Clickbait')
#plt.hist(data_4[data_4['clickbait']==0]['headline_length'], bins, alpha = 0.5, label = 'Not a Clickbait')
#plt.legend()
#plt.show()

In [53]:
# Plotting the headline punctuation percent
#bins = np.linspace(0, 20, 40)
#plt.hist(data_4[data_4['clickbait']==1]['punctuation%'], bins, alpha = 0.5, label = 'Clickbait')
#plt.hist(data_4[data_4['clickbait']==0]['punctuation%'], bins, alpha = 0.5, label = 'Not a Clickbait')
#plt.legend()
#plt.show()

In [54]:
# Plotting the headline uppercase percent
#bins = np.linspace(0, 50, 40)
#plt.hist(data_4[data_4['clickbait']==1]['Uppercase%'], bins, alpha = 0.5, label = 'Clickbait')
#plt.hist(data_4[data_4['clickbait']==0]['Uppercase%'], bins, alpha = 0.5, label = 'Not a Clickbait')
#plt.legend()
#plt.show()

In [55]:
# Plotting the headline question mark count
#bins = np.linspace(0, 5, 40)
#plt.hist(data_4[data_4['clickbait']==1]['Question_mark_count'], bins, alpha = 0.5, label = 'Clickbait')
#plt.hist(data_4[data_4['clickbait']==0]['Question_mark_count'], bins, alpha = 0.5, label = 'Not a Clickbait')
#plt.legend()
#plt.show()

### Box-Cox power transformation

In [56]:
#for i in range(1, 6):
#    plt.hist((data_4['punctuation%'])**(1/i), bins = 40)
#    plt.title("Transformation: 1/{}".format(str(i)))
#    plt.show()

## <center>Random Forest Classifier through Cross-validation</center>

In [57]:
data_df = x_tfidf_df.copy()

In [58]:
data_df.head()

Unnamed: 0,000,00s,01,02,04,08,09,10,100,1000,...,zuma,zurich,zykina,ángel,íngrid,ürümqi,headline_length,punctuation%,uppercase%,question_mark_count
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,15,0.0,27.0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,37,0.0,27.0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,60,5.0,23.0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,56,4.0,21.0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,83,0.0,22.0,0


In [59]:
data_df.isnull().sum()

000                    0
00s                    0
01                     0
02                     0
04                     0
                      ..
ürümqi                 0
headline_length        0
punctuation%           0
uppercase%             0
question_mark_count    0
Length: 15393, dtype: int64

In [60]:
features = data_df
target = click_data['clickbait']

In [61]:
#from sklearn.model_selection import KFold, cross_val_score
#from sklearn.ensemble import RandomForestClassifier

In [62]:
# n_jobs is set to -1 so that multiple jobs can be executed in parallel
#rf = RandomForestClassifier(n_jobs = -1)

# KFold is used to divide the dataset into subsets such that it can divide the dataset into specified number of sets.
# The model is then trained on k-1 sets and 1 set is kept aside as a test set
#k_fold = KFold(n_splits=5)

#cross_val_score(rf, features, target, cv=k_fold, scoring='accuracy', n_jobs=-1)

## <center>Random Forest Classifier through holdout test set</center>

In [63]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support as score

In [64]:
features = features.astype(float, errors = 'raise')

In [65]:
X_train, X_test, Y_train, Y_test = train_test_split(features, target, test_size=0.2)

In [66]:
X_train.head()

Unnamed: 0,000,00s,01,02,04,08,09,10,100,1000,...,zuma,zurich,zykina,ángel,íngrid,ürümqi,headline_length,punctuation%,uppercase%,question_mark_count
13489,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,56.0,0.0,20.0,0.0
10800,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,44.0,5.0,25.0,0.0
6078,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,42.0,0.0,17.0,0.0
7723,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,53.0,0.0,13.0,0.0
285,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,56.0,5.0,18.0,0.0


In [67]:
X_test.head()

Unnamed: 0,000,00s,01,02,04,08,09,10,100,1000,...,zuma,zurich,zykina,ángel,íngrid,ürümqi,headline_length,punctuation%,uppercase%,question_mark_count
13128,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,38.0,0.0,13.0,0.0
11313,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,39.0,0.0,10.0,0.0
13902,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,67.0,0.0,3.0,0.0
15461,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,42.0,2.0,14.0,0.0
2453,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,58.0,2.0,17.0,0.0


In [68]:
# Initializing the model
rf = RandomForestClassifier(n_estimators=50)

# Training the model
rf_model = rf.fit(X_train, Y_train)

# Predict the values
y_pred = rf_model.predict(X_test)

# Calculate the scores
precision, recall, fscore, support = score(Y_test, y_pred)

In [69]:
accuracy = (y_pred == Y_test).sum()/len(y_pred)
print("Precision: {} \nRecall: {} \nFscore: {} \nSupport: {} \nAccuracy: {}". format(precision, recall, fscore, support, accuracy))

Precision: [0.97185864 0.94856459] 
Recall: [0.9452578  0.97360344] 
Fscore: [0.95837367 0.96092093] 
Support: [1571 1629] 
Accuracy: 0.9596875


## <center>Random forest model with grid search</center>

<b>Grid search:</b> Searchs a combination of parameters to determine the best model.

In [70]:
def train_rf(n_est, depth):
    # Initializing the model
    rf = RandomForestClassifier(n_estimators=n_est, max_depth=depth, n_jobs=-1)
    
    # Training the model
    rf_model = rf.fit(X_train, Y_train)

    # Predict the values
    y_pred = rf_model.predict(X_test)

    # Calculate the scores
    precision, recall, fscore, support = score(Y_test, y_pred, average = 'binary')
    accuracy = (y_pred == Y_test).sum()/len(y_pred)
    return precision, recall, fscore, support, accuracy
    
    # Print scores
    # print("Est: {}, depth: {} ---> Precision: {} | Recall: {} | Fscore: {} | Support: {} | Accuracy: {}". format(n_est, depth, precision, recall, fscore, support, accuracy))

In [71]:
score_metrics = []
for n_est in [10, 50, 100]:
    for depth in [10, 20, 30, None]:
        precision, recall, fscore, support, accuracy = train_rf(n_est, depth)
        score_metrics.append((n_est, depth, precision, recall, fscore, support, accuracy ))        

In [72]:
# Creating dataframe of the score metrics
score_df = pd.DataFrame(score_metrics, columns=['n_estimator', 'max_depth', 'Precision', 'Recall', 'Fscore', 'Support', 'Accuracy'])
score_df

Unnamed: 0,n_estimator,max_depth,Precision,Recall,Fscore,Support,Accuracy
0,10,10.0,0.935091,0.565991,0.705163,,0.759062
1,10,20.0,0.839827,0.952732,0.892724,,0.883437
2,10,30.0,0.843393,0.952118,0.894464,,0.885625
3,10,,0.941425,0.957029,0.949163,,0.947812
4,50,10.0,0.841167,0.955801,0.894828,,0.885625
5,50,20.0,0.874789,0.956415,0.913783,,0.908125
6,50,30.0,0.961159,0.896255,0.927573,,0.92875
7,50,,0.94967,0.97299,0.961189,,0.96
8,100,10.0,0.889082,0.93984,0.913757,,0.909687
9,100,20.0,0.9287,0.951504,0.939964,,0.938125


## <center>Random Forest with Grid search cross validation</center>

<b>Cross validation:</b> Divides the dataset into k number of sets and repeats the holdout mehod k number of times where a different set is heldout in each of the iteration

In [73]:
from sklearn.model_selection import GridSearchCV

In [74]:
%%time
# Initializing the model
rf = RandomForestClassifier()

# Setting the parameters for grid search
param = {'n_estimators': [10, 150, 300],
        'max_depth': [30, 60, 90]}

# Intializing grid search cv
gs = GridSearchCV(rf, param, cv =3)
gs_fit = gs.fit(features, target)

# Create dataframe with ascending accuracy value (mean_test_score == accuracy)
gs_df = pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False)
gs_df.head()

Wall time: 13min 29s


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
8,68.312747,0.185241,0.899937,0.004426,90,300,"{'max_depth': 90, 'n_estimators': 300}",0.950694,0.948059,0.95256,0.950437,0.001846,1
4,24.948575,0.24798,0.460771,0.00818,60,150,"{'max_depth': 60, 'n_estimators': 150}",0.952006,0.949559,0.945809,0.949125,0.002548,2
7,35.478035,0.96215,0.556126,0.006378,90,150,"{'max_depth': 90, 'n_estimators': 150}",0.94282,0.944872,0.949372,0.945688,0.002736,3
5,51.158588,1.169272,0.755502,0.032055,60,300,"{'max_depth': 60, 'n_estimators': 300}",0.940007,0.945059,0.942809,0.942625,0.002066,4
2,26.253279,0.127769,0.462774,0.001248,30,300,"{'max_depth': 30, 'n_estimators': 300}",0.942632,0.940371,0.934558,0.939187,0.003401,5


## <center>Gradient Boosting with grid search</center>

In [75]:
from sklearn.ensemble import GradientBoostingClassifier

In [76]:
def train_gb(n_est, depth, lr):
    # Initializing the model
    gb = GradientBoostingClassifier(n_estimators=n_est, max_depth=depth, learning_rate=lr)
    
    # Training the model
    gb_model = gb.fit(X_train, Y_train)

    # Predict the values
    y_pred = gb_model.predict(X_test)

    # Calculate the scores
    precision, recall, fscore, support = score(Y_test, y_pred, average = 'binary')
    accuracy = (y_pred == Y_test).sum()/len(y_pred)
    return precision, recall, fscore, support, accuracy

In [77]:
score_metrics = []
for n_est in [50,100]:
    for depth in [3,7]:
        for lr in [0.1,1]:
            precision, recall, fscore, support, accuracy = train_gb(n_est, depth,lr)
            score_metrics.append((n_est, depth, lr, precision, recall, fscore, support, accuracy ))        

In [78]:
# Creating dataframe of the score metrics
score_df = pd.DataFrame(score_metrics, columns=['n_estimator', 'max_depth', 'Learning Rate', 'Precision', 'Recall', 'Fscore', 'Support', 'Accuracy'])
score_df

Unnamed: 0,n_estimator,max_depth,Learning Rate,Precision,Recall,Fscore,Support,Accuracy
0,50,3,0.1,0.874556,0.907305,0.89063,,0.886563
1,50,3,1.0,0.924954,0.930632,0.927785,,0.92625
2,50,7,0.1,0.901679,0.923266,0.912345,,0.909687
3,50,7,1.0,0.919365,0.92388,0.921617,,0.92
4,100,3,0.1,0.88922,0.916513,0.90266,,0.899375
5,100,3,1.0,0.922937,0.933702,0.928288,,0.926562
6,100,7,0.1,0.918478,0.933702,0.926027,,0.924063
7,100,7,1.0,0.91764,0.916513,0.917076,,0.915625


## <center>Gradient boosting with Grid search cross validation</center>

In [80]:
%%time
# Initializing the model
gb = GradientBoostingClassifier()

# Setting the parameters for grid search
param = {'n_estimators': [100, 150],
        'max_depth': [3, 5],
        'learning_rate': [1]}

# Intializing grid search cv
gs = GridSearchCV(gb, param, cv =3)
gs_fit = gs.fit(features, target)

# Create dataframe with ascending accuracy value (mean_test_score == accuracy)
gs_df = pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False)
gs_df.head()


Wall time: 42min 34s


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,119.490132,1.412438,0.722169,0.045733,1,3,100,"{'learning_rate': 1, 'max_depth': 3, 'n_estima...",0.929321,0.924995,0.92312,0.925812,0.002597,1
1,177.490209,0.414801,0.691157,0.019617,1,3,150,"{'learning_rate': 1, 'max_depth': 3, 'n_estima...",0.928384,0.926683,0.919932,0.925,0.00365,2
3,288.051843,0.496666,0.747085,0.038474,1,5,150,"{'learning_rate': 1, 'max_depth': 5, 'n_estima...",0.927822,0.923495,0.91862,0.923312,0.003759,3
2,192.511758,0.619679,0.724125,0.045716,1,5,100,"{'learning_rate': 1, 'max_depth': 5, 'n_estima...",0.924822,0.920308,0.913932,0.919687,0.004467,4
