In [1]:
import pandas as pd
import os
from argparse import Namespace
from itertools import chain
import numpy as np


# from nltk.stem import WordNetLemmatizer
# from nltk.corpus import stopwords
# from nltk.stem import PorterStemmer

import gensim
from gensim.models import Word2Vec
import gensim.downloader as api
import vocabulary
import preprocessing as prep
import vectorizer


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\alexc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Notebook-wide configuration.
# The data directory can be overridden with the CLICKBAIT_DATA_DIR environment
# variable so the notebook is not tied to one machine; when the variable is
# unset the original location is used unchanged.
args = Namespace(
    path=os.environ.get(
        "CLICKBAIT_DATA_DIR",
        r"C:\Users\alexc\Documents\MSC Data Science\Master_Thesis\Data\clickbait",
    ),
    preprocessing=None,
    dataset="clickbait",  # substring a file name must contain to be loaded
    lemmatization=False,
    stemming=True,
)

Function για να τραβήξω όσα αρχεία είναι σχετικά με clickbait στο directory και μετατροπή τους σε DataFrame

In [3]:
def data_to_df(directory_path=args.path, file_string=args.dataset):
    """Collect the non-empty lines of every matching file into a DataFrame.

    Walks ``directory_path`` recursively and reads, as UTF-8 text, each file
    whose name contains ``file_string``.

    Parameters
    ----------
    directory_path : str
        Root directory to search. Defaults to ``args.path`` (evaluated once,
        when the function is defined).
    file_string : str
        Substring a file name must contain to be loaded. Defaults to
        ``args.dataset``.

    Returns
    -------
    pandas.DataFrame
        One row per non-empty line, with columns ``text_column`` (the line)
        and ``class_column`` (the name of the file the line came from).

    Raises
    ------
    TypeError
        If ``file_string`` is not a ``str``.
    """
    if not isinstance(file_string, str):
        # TypeError is a subclass of Exception, so callers that catch the
        # generic Exception raised previously keep working.
        raise TypeError('Sorry, only string objects in file_string parameter(i.e. args.dataset)')

    rows = []
    for root, _, file_names in os.walk(directory_path):
        for file_name in file_names:
            if file_string not in file_name:
                continue
            # Join with the directory currently being walked rather than the
            # top-level one; otherwise a match inside a sub-directory cannot
            # be opened.
            with open(os.path.join(root, file_name), 'r', encoding='utf8') as fp:
                rows.extend(
                    (line, file_name) for line in fp.read().splitlines() if line
                )
    return pd.DataFrame(rows, columns=['text_column', 'class_column'])

In [4]:
data = data_to_df()
data.head()

Unnamed: 0,text_column,class_column
0,Should I Get Bings,clickbait_data
1,Which TV Female Friend Group Do You Belong In,clickbait_data
2,"The New ""Star Wars: The Force Awakens"" Trailer...",clickbait_data
3,"This Vine Of New York On ""Celebrity Big Brothe...",clickbait_data
4,A Couple Did A Stunning Photo Shoot With Their...,clickbait_data


In [5]:
# data['class_column'][:19000].value_counts()

Παρακάτω κρατάω μόνο τις πρώτες 16500 γραμμές (15999 clickbait_data και 501 non_clickbait_data instances) για να δημιουργήσω imbalanced dataset με IR ≈ 31.93

In [6]:
# Restrict to the first 16500 rows and report their class balance.
first_rows = data.iloc[:16500]
print(first_rows['class_column'].value_counts())

# Shuffle with a fixed seed and renumber the rows from zero.
data = first_rows.sample(frac=1, random_state=1).reset_index(drop=True)

clickbait_data        15999
non_clickbait_data      501
Name: class_column, dtype: int64


In [7]:
# Replace the file-name labels with integer codes:
# 'clickbait_data' -> 0, 'non_clickbait_data' -> 1.
is_clickbait = data['class_column'] == 'clickbait_data'
is_non_clickbait = data['class_column'] == 'non_clickbait_data'
data.loc[is_clickbait, 'class_column'] = 0
data.loc[is_non_clickbait, 'class_column'] = 1

In [8]:
# data.apply(lambda : row[df['Courses'].isin(['Spark','PySpark'])])

In [9]:
data.head()

Unnamed: 0,text_column,class_column
0,What's The Weirdest Thing You've Gotten While ...,0
1,How It Feels To Have A Crush,0
2,"31 Funny Tweets That Are Way, Way Too Real For...",0
3,This Is The Most Epic Netflix And Chill Story ...,0
4,This Stunning Photo Captured The Split Second ...,0


In [10]:
len(data['text_column'])

16500

In [11]:
df_stem = prep.text_preprocessing(data, lemmatization=False, stemming=True)
df_stem.head()

Unnamed: 0,text_column,class_column
0,"[what, weirdest, thing, youv, gotten, trickortr]",0
1,"[feel, crush]",0
2,"[funni, tweet, way, way, real, writer]",0
3,"[epic, netflix, chill, stori, ever]",0
4,"[stun, photo, captur, split, second, hot, tea,...",0


In [12]:
df_lem = prep.text_preprocessing(data, lemmatization=True, stemming=False)
df_lem.head()

Unnamed: 0,text_column,class_column
0,"[whats, weirdest, thing, youve, gotten, tricko...",0
1,"[feel, crush]",0
2,"[funny, tweet, way, way, real, writer]",0
3,"[epic, netflix, chill, story, ever]",0
4,"[stunning, photo, captured, split, second, hot...",0


In [13]:
df_stem

Unnamed: 0,text_column,class_column
0,"[what, weirdest, thing, youv, gotten, trickortr]",0
1,"[feel, crush]",0
2,"[funni, tweet, way, way, real, writer]",0
3,"[epic, netflix, chill, stori, ever]",0
4,"[stun, photo, captur, split, second, hot, tea,...",0
...,...,...
16495,"[went, irl, derek, zooland, center, ridicul, c...",0
16496,"[delight, weird, gift, medic, nerd, life]",0
16497,"[demi, lovato, perform, next, giant, boner]",0
16498,"[regular, peopl, tri, catch, pass, nfl, quarte...",0


In [14]:
voc = vocabulary.Vocabulary(df_stem)

In [15]:
columns = voc.unique_words
target_class = df_stem['class_column'].tolist()

In [16]:
# Build the project's Vectorizer from the stemmed frame and run its three
# preparation steps in the order used throughout this notebook.
# NOTE(review): these methods live in the local `vectorizer` module, which is
# not shown here; later cells read `vec.padded_sequences`,
# `vec.tokenized_text_list` and `vec.vocabulary` after this cell has run.
vec = vectorizer.Vectorizer(df_stem)
vec.convert_text_to_sequences()
vec.text_to_padded_sequences()
vec.padding_text()

# TODO: send a report and upload the code to GitHub

In [17]:
vec.padded_sequences

[[1, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2],
 [1, 9, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2],
 [1, 11, 12, 13, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2],
 [1, 16, 17, 18, 19, 20, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2],
 [1, 21, 22, 23, 24, 25, 26, 27, 28, 29, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2],
 [1, 30, 31, 32, 33, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2],
 [1, 34, 35, 36, 37, 38, 39, 40, 41, 42, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2],
 [1, 43, 44, 45, 46, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2],
 [1, 47, 48, 49, 50, 51, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2],
 [1, 52, 53, 54, 55, 56, 57, 58, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2],
 [1, 59, 32, 19, 60, 61, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2],
 [1, 5, 62, 63, 64, 65, 66, 67, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2],
 [1, 3, 68, 55, 69, 19, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2],
 [1, 47, 26, 70, 71, 72, 73, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2],
 [1, 74, 75, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [18]:
vec.tokenized_text_list

[['what', 'weirdest', 'thing', 'youv', 'gotten', 'trickortr'],
 ['feel', 'crush'],
 ['funni', 'tweet', 'way', 'way', 'real', 'writer'],
 ['epic', 'netflix', 'chill', 'stori', 'ever'],
 ['stun',
  'photo',
  'captur',
  'split',
  'second',
  'hot',
  'tea',
  'froze',
  'midair'],
 ['moment', 'made', 'us', 'proud'],
 ['oh', 'god', 'there', 'go', 'friday', 'night', 'light', 'reunion', 'texa'],
 ['favourit', 'pixar', 'charact', 'say'],
 ['peopl',
  'hashtag',
  'iwillnotapologizeforbeingdarkskin',
  'celebr',
  'complexion'],
 ['band', 'surpris', 'dancer', 'audit', 'music', 'video', 'best', 'way'],
 ['tell', 'us', 'stori', 'sexism', 'colleg'],
 ['thing', 'need', 'know', 'gabriel', 'iglesia', 'weight', 'loss'],
 ['what', 'worst', 'audit', 'horror', 'stori'],
 ['peopl', 'hot', 'martin', 'omalley', 'democrat', 'debat'],
 ['realiti', 'halloween'],
 ['illustr', 'tweet', 'guarante', 'make', 'lol'],
 ['solang', 'knowless', 'wed', 'ring', 'got', 'lost', 'mardi', 'gra'],
 ['lipstick', 'trick', 'p

In [19]:
# onehotvectorizer = vectorizer.OneHotVectorizer(tokenized_text_list=df_stem['text_column'])
# onehotvectorizer.convert_text_to_sequences()
# onehotvectorizer.padding_text_data()
# onehotvectorizer.calculate_mean_number_of_words_per_dataset()

In [20]:
set(vec.vocabulary.unique_words)

{'witcher',
 'nintendo',
 'meetup',
 'virginia',
 'comfi',
 'seinfeld',
 'holm',
 'instafam',
 'limb',
 'euro',
 'nycc',
 'sylvest',
 'masquerad',
 'tallish',
 'paw',
 'push',
 'dayton',
 'safer',
 'tap',
 'glad',
 'agoraphobia',
 'simpl',
 'gyno',
 'stack',
 'babe',
 'dorfman',
 'daria',
 'occup',
 'penelop',
 'conserv',
 'bafta',
 'obrien',
 'hot',
 'neon',
 'anoth',
 'foot',
 'dewantatum',
 'bich',
 'cooper',
 'downton',
 'vitiligo',
 'fittlesworth',
 'unluckiest',
 'mistakenli',
 'lightweight',
 'kappa',
 'herder',
 'irat',
 'facial',
 'probabl',
 'truce',
 'alway',
 'magazin',
 'epic',
 'deep',
 'gummi',
 'achiev',
 'adults',
 'urg',
 'modcloth',
 'ventriloquist',
 'lit',
 'kaley',
 'matcha',
 'offic',
 'miracl',
 'hail',
 'heatwav',
 'illus',
 'cisco',
 'dunder',
 'shootout',
 'newli',
 'hoda',
 'conrad',
 'superfood',
 'militia',
 'iran',
 'offend',
 'patridg',
 'provid',
 'hawk',
 'telenovela',
 'trigger',
 'adha',
 'oj',
 'vigil',
 'seen',
 'movement',
 'fifti',
 'militari',
 

In [21]:
# unique_words = vec.vocabulary.unique_words  
# text_data=df_stem['text_column'].tolist()     

# print(len(unique_words), type(len(unique_words)))
# bow_array = np.zeros((len(text_data),len(unique_words)))



# for row,text in enumerate(text_data):
#     bow_array[row] = [1 if word in text else 0 for word in unique_words]

# print(len([doc for doc in text_data]))

# print(len([column for column in unique_words]))
# print(bow_array.shape)

# bow_df = pd.DataFrame(
# data=bow_array,
# index=[column for column in text_data],
# columns=[doc for doc in unique_words]          
# )

# bow_df['class_column'] = df_stem['class_column'].tolist()


In [22]:
# bow_array = np.zeros((len(vec.tokenized_text_list), len(vec.vocabulary.unique_words)))

In [23]:
# bow_array.shape, len(vec.vocabulary.unique_words)

In [24]:
# bow_df

NameError: name 'bow_df' is not defined

In [None]:
# text = df_stem['text_column'].tolist()
# v = [doc for doc in text]
# len(v)

16500

In [None]:
# a = vectorize_text_to_bow_and_create_dataframe(df_stem,vec.vocabulary.unique_words)

3
9547
(16500, 9547)


ValueError: Shape of passed values is (16500, 9547), indices imply (3, 9547)

In [None]:
# a.head()

In [None]:
# bowvectorizer = vectorizer.BowVectorizer(df_stem)
# bowvectorizer.convert_text_to_sequences()
# bowvectorizer.text_to_padded_sequences()
#bowvectorizer.calculate_mean_and_median_number_of_words_per_text_per_dataset()
# bowvectorizer.padding_text()
# bow_df = bowvectorizer.vectorize_text_to_bow_and_create_dataframe()

ValueError: all arrays must be same length

In [None]:
# bow_df.head()

In [None]:
# print((onehotvectorizer.vectorized_data == bowvectorizer.vectorized_data).all())
# print((tfidfvectorizer.vectorized_data == 0).all())
# print((onehotvectorizer2.vectorized_data == tfidfvectorizer.vectorized_data).all())


In [None]:
# vec.tokenized_text_list[0]

In [25]:
# Run the project's TfIdfVectorizer on the stemmed frame and keep the frame
# returned by create_dataframe().
# NOTE(review): TfIdfVectorizer is defined in the local `vectorizer` module
# (not shown); a later cell rebinds `tfidf_df` to the result of
# create_tfidf_vectors(df_stem), so confirm which of the two is intended.
tfidfvectorizer = vectorizer.TfIdfVectorizer(df_stem)
tfidfvectorizer.convert_text_to_sequences()
tfidfvectorizer.text_to_padded_sequences()
tfidfvectorizer.padding_text()
tfidf_df = tfidfvectorizer.create_dataframe()

In [26]:
df_stem.text_column[0]

['what', 'weirdest', 'thing', 'youv', 'gotten', 'trickortr']

In [27]:
# Show the TF-IDF weights of the first document's tokens next to the class
# label. The previous line had `tfidf_df` fused into the first string literal,
# which left a bare list literal with `.sum()` called on it (AttributeError).
tfidf_df[['what', 'weirdest', 'thing', 'youv', 'gotten', 'trickortr','class_column']]

Unnamed: 0,what,weirdest,thing,youv,gotten,trickortr,class_column
what weirdest thing youv gotten trickortr,0.313219,0.437967,0.210315,0.334802,0.526191,0.526191,0
feel crush,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
funni tweet way way real writer,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
epic netflix chill stori ever,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
stun photo captur split second hot tea froze midair,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
...,...,...,...,...,...,...,...
went irl derek zooland center ridicul cool,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
delight weird gift medic nerd life,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
demi lovato perform next giant boner,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
regular peopl tri catch pass nfl quarterback,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0


In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [25]:
new_text = [' '.join(i) for i in df_stem.text_column]

In [26]:
new_text_df = pd.DataFrame({'text_column':new_text,'class_column':df_stem.class_column.tolist()})

In [27]:
new_text_df

Unnamed: 0,text_column,class_column
0,what weirdest thing youv gotten trickortr,0
1,feel crush,0
2,funni tweet way way real writer,0
3,epic netflix chill stori ever,0
4,stun photo captur split second hot tea froze m...,0
...,...,...
16495,went irl derek zooland center ridicul cool,0
16496,delight weird gift medic nerd life,0
16497,demi lovato perform next giant boner,0
16498,regular peopl tri catch pass nfl quarterback,0


In [28]:
tf = TfidfVectorizer()
X=tf.fit_transform(new_text)
tfidf_tokens = tf.get_feature_names_out()

In [29]:
'tri' in tfidf_tokens

True

In [30]:
tf_idf_dataset = pd.DataFrame(
data=X.toarray(),
index=[doc for doc in new_text], 
columns=tfidf_tokens
)

tf_idf_dataset['class_column'] = df_stem['class_column'].tolist()

In [31]:
tf_idf_dataset[['what', 'weirdest', 'thing', 'youv', 'gotten', 'trickortr','class_column']]

Unnamed: 0,what,weirdest,thing,youv,gotten,trickortr,class_column
what weirdest thing youv gotten trickortr,0.313219,0.437967,0.210315,0.334802,0.526191,0.526191,0
feel crush,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
funni tweet way way real writer,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
epic netflix chill stori ever,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
stun photo captur split second hot tea froze midair,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
...,...,...,...,...,...,...,...
went irl derek zooland center ridicul cool,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
delight weird gift medic nerd life,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
demi lovato perform next giant boner,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
regular peopl tri catch pass nfl quarterback,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [32]:
def create_tfidf_vectors(dataset):
    """Build a dense TF-IDF table for a tokenised dataset.

    Each entry of ``dataset.text_column`` (an iterable of string tokens) is
    joined with single spaces into one document, and a ``TfidfVectorizer``
    with default settings is fitted on those documents.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must provide a ``text_column`` of token sequences and a
        ``class_column`` of labels.

    Returns
    -------
    pandas.DataFrame
        One row per document, indexed by the joined document text, with one
        column per TF-IDF feature name plus a ``class_column`` holding the
        labels in their original row order.
    """
    documents = list(map(' '.join, dataset.text_column))

    tfidf_model = TfidfVectorizer()
    weights = tfidf_model.fit_transform(documents)  # document-term matrix

    tfidf_table = pd.DataFrame(
        weights.toarray(),
        index=documents,
        columns=tfidf_model.get_feature_names_out(),
    )
    # A plain list is assigned by position, so the text-based row index of
    # the table does not have to match the index of `dataset`.
    tfidf_table["class_column"] = dataset['class_column'].tolist()
    return tfidf_table

In [33]:
df_stem

Unnamed: 0,text_column,class_column
0,"[what, weirdest, thing, youv, gotten, trickortr]",0
1,"[feel, crush]",0
2,"[funni, tweet, way, way, real, writer]",0
3,"[epic, netflix, chill, stori, ever]",0
4,"[stun, photo, captur, split, second, hot, tea,...",0
...,...,...
16495,"[went, irl, derek, zooland, center, ridicul, c...",0
16496,"[delight, weird, gift, medic, nerd, life]",0
16497,"[demi, lovato, perform, next, giant, boner]",0
16498,"[regular, peopl, tri, catch, pass, nfl, quarte...",0


In [34]:
tfidf_df = create_tfidf_vectors(df_stem)

(16500, 9525)
----------------------------------
tfidf_tokens
['aap' 'aaron' 'ab' ... 'zooland' 'zowi' 'zuckerberg']
----------------------------------
9525


In [36]:
tfidf_df[['what', 'weirdest', 'thing', 'youv', 'gotten', 'trickortr','class_column']]

Unnamed: 0,what,weirdest,thing,youv,gotten,trickortr,class_column
what weirdest thing youv gotten trickortr,0.313219,0.437967,0.210315,0.334802,0.526191,0.526191,0
feel crush,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
funni tweet way way real writer,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
epic netflix chill stori ever,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
stun photo captur split second hot tea froze midair,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
...,...,...,...,...,...,...,...
went irl derek zooland center ridicul cool,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
delight weird gift medic nerd life,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
demi lovato perform next giant boner,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0
regular peopl tri catch pass nfl quarterback,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0


In [None]:
df_stem.head()

Unnamed: 0,text_column,class_column
0,"[what, weirdest, thing, youv, gotten, trickortr]",0
1,"[feel, crush]",0
2,"[funni, tweet, way, way, real, writer]",0
3,"[epic, netflix, chill, stori, ever]",0
4,"[stun, photo, captur, split, second, hot, tea,...",0


In [None]:
# Assign by position: tfidf_df is indexed by document text while df_stem has
# an integer index, so assigning the Series directly would align on labels
# and fill the column with NaN.
tfidf_df['class_column'] = df_stem['class_column'].tolist()

In [None]:
'abbey' in voc.unique_words

True

In [None]:
# for row,array in enumerate(bowvectorizer.vectorized_data):
#     for index,element in enumerate(array):
#         if element > 2:
#             print('row in dataframe:{}'.format(row))
#             print('value in array:{}'.format(element))
#             print('actual word:{}'.format(voc.unique_words[index]))
#             print('index in array:{}'.format(index))
#             print('bow data array value:{} in the row:{} and index:{}'.format(bowvectorizer.vectorized_data[row][index],row,index))
#             print('-------------------------------------------------')

In [None]:
# one_hot_df = pd.DataFrame(onehotvectorizer.vectorized_data,columns=[word for word in columns])
# one_hot_df['target_class'] = target_class


In [None]:
# bow_df = pd.DataFrame(bowvectorizer.vectorized_data,columns=[word for word in columns])
# bow_df['target_class'] = target_class

In [None]:
# tfidf_df = pd.DataFrame(tfidfvectorizer.vectorized_data,columns=[word for word in columns])
# tfidf_df['target_class'] = target_class

In [None]:
# one_hot_df.head()

Unnamed: 0,PAD,SOS,EOS,cappella,mountain,edm,broadcast,special,phila,stolen,...,jello,bathtim,beaver,princip,squeez,raisin,drown,sobrieti,denver,target_class
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [None]:
# bow_df.head()

Unnamed: 0,PAD,SOS,EOS,cappella,mountain,edm,broadcast,special,phila,stolen,...,jello,bathtim,beaver,princip,squeez,raisin,drown,sobrieti,denver,target_class
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [None]:
# tfidf_df.head()

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier

In [None]:
df_stem['class_column'].tolist()

0        0
1        0
2        0
3        0
4        0
        ..
16495    0
16496    0
16497    0
16498    0
16499    0
Name: class_column, Length: 16500, dtype: object

In [None]:
tfidf_df['class_column'] = df_stem['class_column'].tolist()


In [None]:
tfidf_df

Unnamed: 0,aap,aaron,ab,abandon,abbey,abbi,abc,abduct,abercrombi,abhishek,...,zombi,zomg,zone,zoo,zooey,zookeep,zooland,zowi,zuckerberg,class_column
what weirdest thing youv gotten trickortr,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0
feel crush,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0
funni tweet way way real writer,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0
epic netflix chill stori ever,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0
stun photo captur split second hot tea froze midair,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
went irl derek zooland center ridicul cool,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.419235,0.0,0.0,0
delight weird gift medic nerd life,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0
demi lovato perform next giant boner,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0
regular peopl tri catch pass nfl quarterback,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0


In [None]:
# Every column except the last one (the class label) is used as a feature.
feature_names = tfidf_df.columns[:-1]
X = tfidf_df.loc[:, feature_names]
y = tfidf_df['class_column']

# Stratified hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, stratify=y
)

In [None]:
random_forest_model = RandomForestClassifier(random_state=42)
random_forest_model.fit(X_train,y_train)

In [None]:
# Seed AdaBoost as well so its reported scores are reproducible across runs,
# matching the seeded RandomForestClassifier used alongside it.
ada_model = AdaBoostClassifier(random_state=42)
ada_model.fit(X_train,y_train)

In [None]:
random_y_pred = random_forest_model.predict(X_test)
adaboost_y_pred = ada_model.predict(X_test)

In [None]:
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics

In [None]:
# Score the random-forest predictions on the held-out split.
for metric_name, metric_fn in (
    ("Accuracy", metrics.accuracy_score),
    ("F1_score", metrics.f1_score),
    ("Recall", metrics.recall_score),
    ("Precision", metrics.precision_score),
):
    print(f"Random Forest {metric_name} in tfidf data:", metric_fn(y_test, random_y_pred))


Random Forest Accuracy in tfidf data: 0.9728484848484849
Random Forest F1_score in tfidf data: 0.2222222222222222
Random Forest Recall in tfidf data: 0.128
Random Forest Precision in tfidf data: 0.8421052631578947


In [None]:
# Score the AdaBoost predictions on the held-out split.
for metric_name, metric_fn in (
    ("Accuracy", metrics.accuracy_score),
    ("F1_score", metrics.f1_score),
    ("Recall", metrics.recall_score),
    ("Precision", metrics.precision_score),
):
    print(f"AdaBoost {metric_name} in tfidf data:", metric_fn(y_test, adaboost_y_pred))

AdaBoost Accuracy in tfidf data: 0.9716363636363636
AdaBoost F1_score in tfidf data: 0.22516556291390727
AdaBoost Recall in tfidf data: 0.136
AdaBoost Precision in tfidf data: 0.6538461538461539
