# 1.Loading JSON dataset

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.scripts.glove2word2vec import glove2word2vec
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import KeyedVectors
from nltk.corpus import stopwords
from textblob import TextBlob
from textblob import Word
import pandas as pd 
import numpy as np 
import math
import nltk
import json
import os
import re

def read_truth(path):
    data = []
    index = []
    for i, line in enumerate(open(path + '/truth.jsonl', 'r')):
        instance = json.loads(line)
        data.append(instance)
        index.append(instance['id'])
    df = pd.DataFrame(data=data, index=index)
    df.sort_index(inplace=True)
    return df

def read_instance(path):
    data = []
    index = []
    for i, line in enumerate(open(path + '/instances.jsonl', 'rb')):
        instance = json.loads(line)
        data.append(instance)
        index.append(instance['id'])
    df = pd.DataFrame(data=data, index=index)
    df.sort_index(inplace=True)
    return df


train_truth = read_truth('./clickbait17-train-170331')
train_instances = read_instance('./clickbait17-train-170331')
validation_truth = read_truth('./clickbait17-validation-170630')
validation_instances = read_instance('./clickbait17-validation-170630')



In [2]:
train = pd.merge(train_truth, train_instances, on = 'id')
validation = pd.merge(validation_truth, validation_instances, on = 'id')
data = pd.concat([train, validation],ignore_index = True)
data['truthClass'] = data['truthClass'].map({'clickbait':True ,'no-clickbait':False}).astype(bool)

# 2.Basic Feature Extraction

## 2.1 number of words

+ Number of Words:  

   title    
   body    
   caption 
   

+ Number of character: 

   title    
   body    
   caption 
   
   
+ average word per sentence: 

   body 

In [3]:
def avg_word(sentence):
    sentence = str(sentence)
    words = sentence.split()
    return round((sum(len(word) for word in words)/len(words)), 1)


char_title = []
word_title = []
avg_paragraph = []
char_paragraph = []
word_paragraph = []
char_caption = []
word_caption = []

for title in data['targetTitle']:
    char = len(str(title))
    word = len(str(title).split(' '))
    char_title.append(char)
    word_title.append(word)
    
for paragraph in data['targetParagraphs']:
    char = len(str(paragraph))
    word = len(str(paragraph).split(' '))
    avg = avg_word(paragraph)
    avg_paragraph.append(avg)
    char_paragraph.append(char)
    word_paragraph.append(word)
    
for caption in data['targetCaptions']:
    char = len(str(caption))
    word = len(str(caption).split(' '))
    char_caption.append(char)
    word_caption.append(word)

In [4]:
char_feature = pd.DataFrame([char_caption,
                             word_caption,
                             char_title,
                             word_title,
                             char_paragraph,
                             word_paragraph,
                             avg_paragraph])
char_feature = char_feature.T
char_feature.rename(columns = {0: 'char_caption',
                               1: 'word_caption',
                               2: 'char_title',
                               3: 'word_title',
                               4: 'char_paragraph',
                               5: 'word_paragraph',
                               6: 'avg_paragraph',
                              },inplace = True)

In [5]:
char_feature.describe()

Unnamed: 0,char_caption,word_caption,char_title,word_title,char_paragraph,word_paragraph,avg_paragraph
count,21997.0,21997.0,21997.0,21997.0,21997.0,21997.0,21997.0
mean,585.35473,83.002182,80.443333,13.201755,3637.643406,594.734464,5.224603
std,2605.827071,329.69662,152.086847,23.953162,4226.469811,706.772198,1.348661
min,2.0,1.0,4.0,1.0,2.0,1.0,2.0
25%,39.0,4.0,56.0,9.0,1464.0,237.0,5.0
50%,142.0,20.0,67.0,11.0,2650.0,429.0,5.2
75%,458.0,69.0,80.0,13.0,4643.0,757.0,5.4
max,144852.0,16316.0,4038.0,651.0,199672.0,33306.0,112.0


## 2.2 Text feature 

counting certain character in text

In [6]:
num_at = []
num_acronym = []
num_baity = []#!
num_cap = []
num_digit = []
num_exclm = []
num_money = []
num_ques = []
num_quote = []#!
num_tag = []
num_pic = []
num_parenthesis = []
is_start_num = []
is_superlative = []
is_start5w1h = []

table_currency = ['¥','$','€','£','￠']
table_bracket = ['(',')','[',']','{','}']
table_quote = ["'m","'re","'ve","'d","'s","s'"]
table_baity = ["click here","exclusive","won't believe","happen next","don't want","you know"]
table_5w1h = ["what","why","when","who","which","how"]

for caption in data['targetCaptions']:
    num_pic.append(len(set(caption)))  

for sentences in data['targetTitle']:
    sentences = str(sentences)
    tmp = sentences.split()
    s = str(tmp[0])
    
    question_mark = 0
    start_digit = -1
    start_5w1h = -1
    parenthesis = 0
    superlative = -1
    exclamation = 0
    digital = 0
    acronym = 0
    capital = 0
    money = 0
    baity = 0
    quote = 0
    start = 0
    hash_tag = 0
    at = 0
    
    text = nltk.word_tokenize(sentences)
    part_of_speech = nltk.pos_tag(text)
    for token,tag in part_of_speech:
        if tag in ['RBS','JJS']:
            superlative  = 1
            
    if s in table_5w1h:
        start_5w1h = 1
    if s.isdigit():
        start_digit = 1
        
    for word in sentences.split():        
        if len(word) <= 5:
            acronym += 1
        if len(word) > 5:
            if word.isupper():
                capital += 1                
        for char in word:
            if char == '!':
                exclamation += 1
            if char == '?':
                question_mark += 1
            if char in table_currency:
                money += 1
            if char in table_bracket:
                parenthesis += 1
            if char == '@':
                at += 1
            if char == '#':
                hash_tag += 1
            if char.isdigit:
                digital += 1
    
    num_acronym.append(acronym)
    num_at.append(at)
    num_cap.append(capital)
    num_digit.append(digital)
    num_exclm.append(exclamation)
    num_money.append(money)
    num_parenthesis.append(parenthesis)
    num_ques.append(question_mark)
    num_tag.append(hash_tag)
    is_start5w1h.append(start_5w1h)
    is_start_num.append(start_digit)
    is_superlative.append(superlative)
#     num_baity.append(baity)
#     num_quote.append(quote)


In [7]:
title_feature = pd.DataFrame([num_acronym,
                             num_at,
                             num_cap,
                             num_digit,
                             num_exclm,
                             num_money,
                             num_parenthesis,
                             num_ques,
                             num_tag,
                             num_pic,
                             is_start_num, 
                             is_superlative,
#                              is_start5w1h
                               ])
title_feature = title_feature.T
title_feature.rename(columns = {0:'num_acronym',
                               1:'num_at',
                               2:'num_cap',
                               3:'num_digit',
                               4:'num_exclm',
                               5:'num_money',
                               6:'num_parenthesis',
                               7:'num_ques',
                               8:'num_tag',
                               9:'num_pic',
                               10:'is_start_num',  
                               11:'is_superlative',
#                                10:'is_start5w1h'
                              },inplace = True)

In [8]:
title_feature.describe()

Unnamed: 0,num_acronym,num_at,num_cap,num_digit,num_exclm,num_money,num_parenthesis,num_ques,num_tag,num_pic,is_start_num,is_superlative
count,21997.0,21997.0,21997.0,21997.0,21997.0,21997.0,21997.0,21997.0,21997.0,21997.0,21997.0,21997.0
mean,7.717598,0.000409,0.019685,68.190526,0.022639,0.028868,0.028186,0.052553,0.002682,5.487612,-0.917989,-0.91017
std,13.360093,0.020224,0.15054,128.193395,0.154158,0.179241,0.236523,0.258801,0.054294,10.244213,0.396615,0.414245
min,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0
25%,5.0,0.0,0.0,47.0,0.0,0.0,0.0,0.0,0.0,1.0,-1.0,-1.0
50%,6.0,0.0,0.0,57.0,0.0,0.0,0.0,0.0,0.0,2.0,-1.0,-1.0
75%,9.0,0.0,0.0,68.0,0.0,0.0,0.0,0.0,0.0,6.0,-1.0,-1.0
max,380.0,1.0,3.0,3388.0,3.0,4.0,4.0,16.0,2.0,370.0,1.0,1.0


## 2.3 Informality and Forward Reference features
+ CLScore = 0.0588 * L - 0.296 * S - 15.8  

   L = average number of letters     
   S = average number of sentence per 100 words  
   

+ RIX = LW/S  

+ LIX = W/S + (100*LW)/W  

   W = number of words     
   LW number of long words(7+ characters)     
   S = number of sentence  
   
   
+ Formality Measure (fmeasure):  

   (nounfreq+adjectivefreq+prepositionfreq+particlefreq-pronounfreq-verbfreq-adverbfreq-interjectionfreq+100)*0.5   
   

+ Sentiment Analysis:  

   extract polarity as it indicates the sentiment as value nearer to 1 means a positive sentiment and values nearer to -1 means a negative sentiment.

In [9]:
RIX = []
LIX = []
sentiment = []
CLScore = []
f_measure = []

for paragraph in data['targetParagraphs']:
    
    avg_letter = 0
    avg_sentence = 0
    num_word = 0
    num_char = 0
    num_sentence = 0
    long_word = 0
    noun_freq = 0
    adjective_freq = 0
    preposition_freq = 0
    particle_freq = 0
    pronoun_freq = 0
    verb_freq = 0
    adverb_freq = 0
    interjection_freq = 0  
    measure = 0
    
    paragraph = str(paragraph)
    text = nltk.word_tokenize(paragraph)
    part_of_speech = nltk.pos_tag(text)
    for token,tag in part_of_speech:
        if tag in ['NN','NNS','NNP','NNPS']:
            noun_freq += 1
        if tag in ['VB','VBD','VBG','VBN','VBP','VBZ']:
            verb_freq += 1
        if tag == 'UH':
            interjection_freq += 1
        if tag in ['RB','RBS','RBR']:
            adverb_freq += 1
        if tag == 'RP':
            particle_freq += 1
        if tag in ['JJ','JJR','JJS'] :
            adjective_freq += 1
        if tag == 'in':
            preposition_freq += 1
        if tag in ['WRB','WP$','WP','PRP$','PRP']:
            pronoun_freq += 1
    measure = (noun_freq+adjective_freq+preposition_freq+particle_freq-pronoun_freq-verb_freq-adverb_freq-interjection_freq)
        
    for word in paragraph.split():
        num_word += 1
        num_char += len(word)
        if len(word) >= 7:
            long_word += 1
        for char in word:
            if char == '.':
                num_sentence += 1
    
    h_word = num_word//100
    if h_word == 0:
        h_word = 1
    if num_sentence == 0:
        num_sentence = 1
    if num_word == 0:
        num_word = 1

    avg_letter = round((num_char / num_word), 1)
    avg_sentence = round((num_sentence / h_word), 1)
    RIX.append(round(long_word/num_sentence, 1))
    CLScore.append(round(0.0588*avg_letter-0.296*avg_sentence-15.8, 1))
    sentiment.append(round(TextBlob(paragraph).sentiment[0], 1))
    LIX.append(round(num_word/num_sentence + (100*long_word)/num_word, 1))
    f_measure.append((measure+100)/2)
    

In [10]:
content_feature = pd.DataFrame([RIX,
                             LIX,
                             sentiment,
                             CLScore,
                             f_measure,
                               ])
content_feature = content_feature.T
content_feature.rename(columns = {0:'RIX',
                               1:'LIX',
                               2:'sentiment',
                               3:'CLScore',
                               4:'f_measure',
                              },inplace = True)

In [11]:
content_feature.describe()

Unnamed: 0,RIX,LIX,sentiment,CLScore,f_measure
count,21997.0,21997.0,21997.0,21997.0,21997.0
mean,5.565254,47.946152,0.095872,-17.39679,96.145906
std,3.774795,12.432667,0.110498,0.930461,69.958521
min,0.0,1.0,-1.0,-35.2,-521.5
25%,4.0,42.3,0.0,-17.7,63.0
50%,5.2,47.5,0.1,-17.3,81.5
75%,6.5,52.5,0.1,-16.9,111.0
max,142.0,515.2,1.0,-10.1,3373.5


## 2.4 Similarity between Title and Top 5 Sentences

Similarity between the title and the top one,two,three,four and five sentences of the body. Using tf-idf encoding to compute the similarity and removed stopwords

In [12]:
top1sim = []
top2sim = []
top3sim = []
top4sim = []
top5sim = []

for i in range(len(data)):
    
    title = data['targetTitle'][i]
    body = data['targetParagraphs'][i]
    sim_1 = sim_2 = sim_3 = sim_4 = sim_5 = 0.0
    sen_1 = sen_2 = sen_3 = sen_4 = sen_5 = ''
    
    if len(body) == 0:
        top1sim.append(sim_1)
        top2sim.append(sim_2)
        top3sim.append(sim_3)
        top4sim.append(sim_4)
        top5sim.append(sim_5)
        continue
    if len(body) >= 1:
        sen_1 = body[0]
    if len(body) >= 2:
        sen_2 = body[1]
    if len(body) >= 3:
        sen_3 = body[2]
    if len(body) >= 4:
        sen_4 = body[3]
    if len(body) >= 5:
        sen_5 = body[4]
    
    essay = str(title + ' ' + sen_1 + ' ' + sen_2 + ' '  + sen_3 + ' ' + sen_4 + ' ' + sen_5)
    tfidf = TfidfVectorizer(lowercase = True, analyzer = 'word', stop_words = 'english',ngram_range = (1,1))
    train_vect = tfidf.fit_transform([essay])    
    title_vect = tfidf.transform([title])
    
    s1_vect = tfidf.transform([sen_1])
    sim_1 = round(float(cosine_similarity(title_vect,s1_vect)),1)
    top1sim.append(sim_1)
    
    s2_vect = tfidf.transform([sen_2])
    sim_2 = round(float(cosine_similarity(title_vect,s2_vect)),1)
    top2sim.append(sim_2)
    
    s3_vect = tfidf.transform([sen_3])
    sim_3 = round(float(cosine_similarity(title_vect,s3_vect)),1)
    top3sim.append(sim_3)
    
    s4_vect = tfidf.transform([sen_4])
    sim_4 = round(float(cosine_similarity(title_vect,s4_vect)),1)
    top4sim.append(sim_4)
    
    s5_vect = tfidf.transform([sen_5])
    sim_5 = round(float(cosine_similarity(title_vect,s5_vect)),1)
    top5sim.append(sim_5)

In [13]:
sim_feature = pd.DataFrame([top1sim,
                             top2sim,
                             top3sim,
                             top4sim,
                             top5sim,
                               ])
sim_feature = sim_feature.T
sim_feature.rename(columns = {0:'top1sim',
                               1:'top2sim',
                               2:'top3sim',
                               3:'top4sim',
                               4:'top5sim',
                              },inplace = True)

In [14]:
sim_feature.describe()

Unnamed: 0,top1sim,top2sim,top3sim,top4sim,top5sim
count,21997.0,21997.0,21997.0,21997.0,21997.0
mean,0.237628,0.166332,0.126017,0.11778,0.100714
std,0.208294,0.166482,0.139019,0.13668,0.122919
min,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0
50%,0.2,0.1,0.1,0.1,0.1
75%,0.4,0.3,0.2,0.2,0.2
max,1.0,1.0,1.0,1.0,1.0


# 3.Basic Pre-processing

## 3.1 Lower case Removing Punctuation Removal of Stop Words

In [15]:
en_corpus = stopwords.words('english')
stop_words = []
caption = []
content = []
title = []

for sentence in data['targetParagraphs']:
    sentence = str(sentence).replace('[^\w\s]','')
    sentence = re.sub('[\u0060|\u0021-\u002c|\u002e-\u002f|\u003a-\u003f|\u2200-\u22ff|\uFB00-\uFFFD|\u2E80-\u33FF]',' ',sentence)
    stop_cnt = 0
    string = ''
#     sentence = TextBlob(sentence).correct()
    for word in sentence.split():
        if word in en_corpus:
            stop_cnt += 1
        if word not in en_corpus:            
            string += ' ' + word.lower()
    stop_words.append(stop_cnt)
    content.append(string)    

for sentence in data['targetCaptions']:
    string = ''
    sentence = str(sentence).replace('[^\w\s]','')
    sentence = re.sub('[\u0060|\u0021-\u002c|\u002e-\u002f|\u003a-\u003f|\u2200-\u22ff|\uFB00-\uFFFD|\u2E80-\u33FF]',' ',sentence)
    for word in sentence.split():
        if word not in en_corpus:            
            string += ' ' + word.lower()
    caption.append(string) 

for sentence in data['targetTitle']:
    string = ''
    sentence = str(sentence).replace('[^\w\s]','')
    sentence = re.sub('[\u0060|\u0021-\u002c|\u002e-\u002f|\u003a-\u003f|\u2200-\u22ff|\uFB00-\uFFFD|\u2E80-\u33FF]',' ',sentence)
    for word in sentence.split():
        if word not in en_corpus:            
            string += ' ' + word.lower()
    title.append(string)
    
corpus = title + content + caption

## 3.2 Common / Rare words removal & Lemmatization

In [16]:
target_caption = []
target_content = []
target_title = []

common = pd.Series(' '.join(caption).split()).value_counts()[:10]
rare = pd.Series(' '.join(caption).split()).value_counts()[-10:]  

for sentence in caption:
    string = ''
    lemma = ''
    sentence = str(sentence)
    for word in sentence.split():
        if word not in common:
            if word  not in rare:
                lemma = Word(word).lemmatize()
                string += ' ' + lemma
    target_caption.append(string)

common = pd.Series(' '.join(content).split()).value_counts()[:10]
rare = pd.Series(' '.join(content).split()).value_counts()[-10:]  

for sentence in content:
    string = ''
    sentence = str(sentence)
    for word in sentence.split():
        if word not in common:
            if word  not in rare:
                lemma = Word(word).lemmatize()
                string += ' ' + lemma
    target_content.append(string)
    
common = pd.Series(' '.join(title).split()).value_counts()[:10]
rare = pd.Series(' '.join(title).split()).value_counts()[-10:]  

for sentence in title:
    string = ''
    sentence = str(sentence)
    for word in sentence.split():
        if word not in common:
            if word  not in rare:
                lemma = Word(word).lemmatize()
                string += ' ' + lemma
    target_title.append(string) \
    
target_corpus = []
for i in range(len(target_content)):
    total_context = str(target_caption[i] + target_content[i] + target_title[i])
    target_corpus.append(total_context)

# 4.Advance Text Processing
## 4.1 N-grams

In [17]:
TextBlob(corpus[0]).ngrams(2)

[WordList(['tony', 'nominees']),
 WordList(['nominees', 'craziest']),
 WordList(['craziest', 'moments']),
 WordList(['moments', 'stage'])]

## 4.2 TF-IDF

In [18]:
tfidf = TfidfVectorizer(max_features = 1024, lowercase = True, analyzer = 'word', stop_words = 'english',ngram_range = (1,1))
corpus_vect = tfidf.fit_transform(target_corpus)

vec_feature = []

for article in target_corpus:
    corpus_vect = tfidf.transform([article])
    vec_feature.append(corpus_vect)

tfidf_feature = []

for index in vec_feature:
    tmp = index.toarray().reshape(1024)
    tfidf_feature.append(tmp)
    
tfidf_feature = pd.DataFrame(tfidf_feature)

## 4.3 BoW

In [19]:
bow = CountVectorizer(max_features=1000, lowercase=True, ngram_range=(1,1),analyzer = "word")
train_bow = bow.fit_transform(target_corpus)

train_bow

<21997x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 2535929 stored elements in Compressed Sparse Row format>

## 4.4 Word Embeddings

In [20]:
# glove_input_file = 'glove.840B.300d.txt'
# word2vec_output_file = 'glove.840B.300d.txt.word2vec'
# glove2word2vec(glove_input_file, word2vec_output_file)
# filename = 'glove.840B.300d.txt.word2vec'
# model = KeyedVectors.load_word2vec_format(filename, binary=False)

In [21]:
# filename = 'glove.840B.300d.txt.word2vec'
# model = KeyedVectors.load_word2vec_format(filename, binary=False)

# 5.Model

In [23]:
import matplotlib.pyplot as plt 
import seaborn as sns

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn import metrics
from sklearn.metrics import accuracy_score, log_loss
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression

classifiers = [
    KNeighborsClassifier(3),
    SVC(probability=True),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    GaussianNB(),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis(),
    LogisticRegression()]

classifiers_name = [
    "KNeighborsClassifier",
    "SVC",
   " DecisionTreeClassifier",
    "RandomForestClassifier",
    "AdaBoostClassifier",
    "GradientBoostingClassifier",
    "GaussianNB",
    "LinearDiscriminantAnalysis",
    "QuadraticDiscriminantAnalysis",
    "LogisticRegression"]

log_cols = ['Classifier','Accuracy']
log = pd.DataFrame(columns = log_cols)

sss = StratifiedShuffleSplit(n_splits = 10, test_size = 0.1, random_state = 0)

# acc_dict = {}
# mse_dict = {}

# handy_feature = pd.concat([char_feature,title_feature,content_feature,sim_feature,tfidf_feature], axis = 1)
# y = []
# for label in data['truthMean']:
#     y.append(math.floor(label*10))
# X = np.array(handy_feature)
# y = np.array(y)

# for train_index, test_index in sss.split(X,y):
    
#     print("Train Index:\n",train_index,"\nTest Index:\n",test_index)  
    
#     X_train, X_test = X[train_index],X[test_index]
#     y_train, y_test = y[train_index],y[test_index]
    
#     for clf in classifiers:
#         name = clf.__class__.__name__
#         clf.fit(X_train,y_train)
#         train_predictions = clf.predict(X_test)
#         acc = accuracy_score(y_test,train_predictions)
#         mse = mean_squared_error(y_test, y_pred)
#         if name in acc_dict:
#             acc_dict[name] += acc
#         else:
#             acc_dict[name] = acc
        
#         if name in mse_dict:
#             mse_dict[name] += mse
#         else:
#             mse_dict[name] = mse
# for clf in acc_dict:
#     acc_dict[clf] = acc_dict
#     log_entry = pd.DataFrame([[clf,acc_dict[clf]]],columns = log_cols)
    
# from sklearn.metrics import accuracy_score, log_loss
# from sklearn.metricsics import mean_squared_errorprint(log)

  from numpy.core.umath_tests import inner1d


In [25]:
from sklearn.metrics import accuracy_score,log_loss,mean_squared_error,precision_score,f1_score,confusion_matrix
from sklearn import metrics

handy_feature = pd.concat([title_feature,content_feature,sim_feature,tfidf_feature], axis = 1)
y = []
for label in data['truthMean']:
    if label >= 0.5:
        y.append(1)
    else:
        y.append(-1)
X = np.array(handy_feature)
y = np.array(y)


for train_index, test_index in sss.split(X,y):
    
    print("Train Index:\n",train_index,"\nTest Index:\n",test_index)  
    
    X_train, X_test = X[train_index],X[test_index]
    y_train, y_test = y[train_index],y[test_index]
    
    candidate_classifier.fit(X_train,y_train)
    train_predictions = candidate_classifier.predict(X_test)
    acc = accuracy_score(y_test,train_predictions)
    mse = mean_squared_error(y_test, train_predictions)
    fpr, tpr, thresholds = metrics.roc_curve(y_test, train_predictions)
    auc = metrics.auc(fpr, tpr)
    print(acc,mse,auc)
    print(confusion_matrix(y_test, train_predictions))

Train Index:
 [17656 16400  4252 ... 16772 17094 11542] 
Test Index:
 [10864 20899 17734 ...  7067  4378  5115]
0.7840909090909091 0.8636363636363636 0.6282130879458966
[[1551  104]
 [ 371  174]]
Train Index:
 [11013  3244 12404 ... 11979   978  9197] 
Test Index:
 [ 1268 17329 19377 ...  6354  1835  9929]
0.8 0.8 0.6560159649657695
[[1558   97]
 [ 343  202]]
Train Index:
 [  750 13429  6061 ...  1253 20203 18677] 
Test Index:
 [ 4030 18640 19072 ... 11712 14646 19399]
0.8086363636363636 0.7654545454545455 0.6777543723495663
[[1551  104]
 [ 317  228]]
Train Index:
 [10165 20025  4695 ... 18858  8116  4754] 
Test Index:
 [ 2439 10864 14900 ... 21995 20204  3093]
0.8127272727272727 0.7490909090909091 0.678627456415089
[[1563   92]
 [ 320  225]]
Train Index:
 [10971 11322  6202 ...  8048 19030 21690] 
Test Index:
 [18079  7051 11968 ...  6594 17319 16912]
0.8072727272727273 0.7709090909090909 0.6780786607167605
[[1546  109]
 [ 315  230]]
Train Index:
 [ 6575  6777 17445 ... 20686 14031  2