In [50]:
# import neccessary libraries that may be useful

import re
import string
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [51]:
# import language processing functions

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\adity\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\adity\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [52]:
# Data importing

all_data = []
csv_dir = './YouTube-Spam-Collection-v1/'
csv_files = ['Youtube01-Psy.csv','Youtube02-KatyPerry.csv','Youtube03-LMFAO.csv','Youtube04-Eminem.csv','Youtube05-Shakira.csv']

for file in csv_files:
    data = pd.read_csv(csv_dir + file)
    all_data.append(data)
all_data = pd.concat(all_data)

# Sanity checkpoint
all_data.head()

Unnamed: 0,COMMENT_ID,AUTHOR,DATE,CONTENT,CLASS
0,LZQPQhLyRh80UYxNuaDWhIGQYNQ96IuCg-AYWqNPjpU,Julius NM,2013-11-07T06:20:48,"Huh, anyway check out this you[tube] channel: ...",1
1,LZQPQhLyRh_C2cTtd9MvFRJedxydaVW-2sNg5Diuo4A,adam riyati,2013-11-07T12:37:15,Hey guys check out my new channel and our firs...,1
2,LZQPQhLyRh9MSZYnf8djyk0gEF9BHDPYrrK-qCczIY8,Evgeny Murashkin,2013-11-08T17:34:21,just for test I have to say murdev.com,1
3,z13jhp0bxqncu512g22wvzkasxmvvzjaz04,ElNino Melendez,2013-11-09T08:28:43,me shaking my sexy ass on my channel enjoy ^_^ ﻿,1
4,z13fwbwp1oujthgqj04chlngpvzmtt3r3dw,GsMega,2013-11-10T16:05:38,watch?v=vtaRGgvGtWQ Check this out .﻿,1


In [53]:
# Data imbalance check (no issue here)
all_data['CLASS'].value_counts()

1    1005
0     951
Name: CLASS, dtype: int64

In [54]:
# Data preprocessing / cleaning

# Only keep Comment content and Class label
all_data.drop(['COMMENT_ID','AUTHOR','DATE'], axis=1, inplace=True, errors='ignore')
all_data.head()

Unnamed: 0,CONTENT,CLASS
0,"Huh, anyway check out this you[tube] channel: ...",1
1,Hey guys check out my new channel and our firs...,1
2,just for test I have to say murdev.com,1
3,me shaking my sexy ass on my channel enjoy ^_^ ﻿,1
4,watch?v=vtaRGgvGtWQ Check this out .﻿,1


In [55]:
# Data preprocessing / cleaning
def process_content(comment):
    edited_comment = " ".join(re.findall("[A-Za-z]+", comment.lower()))
    edited_comment = edited_comment.replace('\ufeff', '')
    edited_comment = re.sub(r"(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w \.-]*)",'http', edited_comment)
    return edited_comment

In [56]:
def text_cleaner(text):
    rules = [
        {r'>\s+': u'>'},  # remove spaces after a tag opens or closes
        {r'\s+': u' '},  # replace consecutive spaces
        {r'\s*<br\s*/?>\s*': u'\n'},  # newline after a <br>
        {r'</(div)\s*>\s*': u'\n'},  # newline after </p> and </div> and <h1/>...
        {r'</(p|h\d)\s*>\s*': u'\n\n'},  # newline after </p> and </div> and <h1/>...
        {r'<head>.*<\s*(/head|body)[^>]*>': u''},  # remove <head> to </head>
        {r'<a\s+href="([^"]+)"[^>]*>.*</a>': r'\1'},  # show links instead of texts
        {r'[ \t]*<[^<]*?/?>': u''},  # remove remaining tags
        {r'^\s+': u''}  # remove spaces at the beginning
    ]
    for rule in rules:
        for (k, v) in rule.items():
            regex = re.compile(k)
            text = regex.sub(v, text)
        text = text.rstrip()
    return text.lower()

In [57]:
# all_data['PROCESSED CONTENT'] = all_data['CONTENT'].apply(process_content)
all_data['PROCESSED CONTENT'] = all_data['CONTENT'].apply(text_cleaner)
all_data.head()

Unnamed: 0,CONTENT,CLASS,PROCESSED CONTENT
0,"Huh, anyway check out this you[tube] channel: ...",1,"huh, anyway check out this you[tube] channel: ..."
1,Hey guys check out my new channel and our firs...,1,hey guys check out my new channel and our firs...
2,just for test I have to say murdev.com,1,just for test i have to say murdev.com
3,me shaking my sexy ass on my channel enjoy ^_^ ﻿,1,me shaking my sexy ass on my channel enjoy ^_^ ﻿
4,watch?v=vtaRGgvGtWQ Check this out .﻿,1,watch?v=vtarggvgtwq check this out .﻿


In [58]:
# Train test split
x_train, x_test, y_train, y_test = train_test_split(all_data['PROCESSED CONTENT'],all_data['CLASS'], test_size=0.2, random_state=69)

# Sanity checkpoint
print(x_train)
print(y_train)

# Print the shape train and test sets
print("x_train.shape = " + str(x_train.shape))
print("x_test.shape = " + str(x_test.shape))

413                       me and my big sister like you﻿
187    who else would give katy perry a good old migh...
39     its a good song and i like her video clip, bec...
294       :) i&#39;ll subscribe to you. you look nice :)
428                          watch this with sound off!﻿
                             ...                        
89     http://www.aaas.org/tech-i/vote#view/25874/217...
40                                     watching in 2015﻿
269    when i hear katy singing this, i cry. the song...
89     check out the new hot video by dante b called ...
378    subscribe!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!...
Name: PROCESSED CONTENT, Length: 1564, dtype: object
413    0
187    0
39     0
294    1
428    0
      ..
89     1
40     0
269    0
89     1
378    1
Name: CLASS, Length: 1564, dtype: int64
x_train.shape = (1564,)
x_test.shape = (392,)


In [59]:
# Feature extraction using Counter Vectorizer
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer()
x_train_counts = count_vect.fit_transform(x_train)
x_test_counts = count_vect.transform(x_test)

In [60]:
# Term frequency - inverse document frequency
from sklearn.feature_extraction.text import TfidfTransformer

tranformer = TfidfTransformer()
x_train_tfidf = tranformer.fit_transform(x_train_counts)
x_test_tfidf = tranformer.transform(x_test_counts)

df = pd.DataFrame(x_train_tfidf[0].T.todense(), index=count_vect.get_feature_names(), columns=["TF-IDF"])
df = df.sort_values('TF-IDF', ascending=False)
print (df.head(25))

              TF-IDF
sister      0.628545
big         0.515562
me          0.295294
like        0.286241
you         0.240679
my          0.239767
and         0.233625
pay         0.000000
payhip      0.000000
pe          0.000000
paša        0.000000
paul        0.000000
pc          0.000000
patriot     0.000000
pcs         0.000000
pdf         0.000000
pazzi       0.000000
peace       0.000000
patriarchs  0.000000
peaceful    0.000000
peach       0.000000
peep        0.000000
pen         0.000000
penis       0.000000
penny       0.000000




In [61]:
# Same as CountVectorizer + TfidfTransformer (https://towardsdatascience.com/tf-idf-explained-and-python-sklearn-implementation-b020c5e83275)

from sklearn.feature_extraction.text import TfidfVectorizer

tfIdfVectorizer=TfidfVectorizer(use_idf=True)
x_train_tfidf = tfIdfVectorizer.fit_transform(x_train)
x_test_tfidf = tfIdfVectorizer.transform(x_test)

df = pd.DataFrame(x_train_tfidf[0].T.todense(), index=tfIdfVectorizer.get_feature_names(), columns=["TF-IDF"])
df = df.sort_values('TF-IDF', ascending=False)
print (df.head(25))

              TF-IDF
sister      0.628545
big         0.515562
me          0.295294
like        0.286241
you         0.240679
my          0.239767
and         0.233625
pay         0.000000
payhip      0.000000
pe          0.000000
paša        0.000000
paul        0.000000
pc          0.000000
patriot     0.000000
pcs         0.000000
pdf         0.000000
pazzi       0.000000
peace       0.000000
patriarchs  0.000000
peaceful    0.000000
peach       0.000000
peep        0.000000
pen         0.000000
penis       0.000000
penny       0.000000




In [62]:
# Create and train Logistic Regression model
from sklearn.linear_model import LogisticRegression

model_LR = LogisticRegression()
model_LR.fit(x_train_tfidf, y_train)

accuracy = model_LR.score(x_test_tfidf, y_test)
print(accuracy)

0.9744897959183674


In [63]:
# Create and train Random Forest Classifier model
from sklearn.ensemble import RandomForestClassifier

model_RFC = RandomForestClassifier()
model_RFC.fit(x_train_tfidf,y_train)

accuracy = model_RFC.score(x_test_tfidf, y_test)
print(accuracy)

0.9668367346938775


In [64]:
# Create and train Multi-Layer Perceptron model
from sklearn.neural_network import MLPClassifier

model_NN = MLPClassifier(hidden_layer_sizes=(20,40,40,20), activation='relu', solver='adam', max_iter=10000)
model_NN.fit(x_train_tfidf, y_train)

accuracy = model_NN.score(x_test_tfidf, y_test)
print(accuracy)

0.9540816326530612


In [65]:
# Create and train XGBClassifier
from xgboost import XGBClassifier

model_XGB = XGBClassifier(objective = 'binary:logistic', max_depth = 4, alpha = 10, learning_rate = 1.0, n_estimators = 100)
model_XGB.fit(x_train_tfidf, y_train)

accuracy = model_XGB.score(x_test_tfidf, y_test)
print(accuracy)

0.9158163265306123




In [66]:
# # To improve, can use Grid Search to find best parameters

# # Try Grid Search with Random Forest Classifier

# from sklearn.model_selection import GridSearchCV

# parameters = {
#                 'n_estimators': [80, 100, 120],
#                 'bootstrap': [True, False],
#                 'criterion' : ['gini', 'entropy']
#              }

# model_RFC_GSCV = GridSearchCV(RandomForestClassifier(), parameters)
# model_RFC_GSCV.fit(x_train_tfidf, y_train)

# print(model_RFC_GSCV.best_params_)

# accuracy = model_RFC_GSCV.score(x_test_tfidf, y_test)
# print(accuracy)

In [67]:
# Also, let's try Naive Bayes method.

stopwords_english = stopwords.words('english') 
stemmer = PorterStemmer() 

In [68]:
def count_tweets(result, comments, ys):
    '''
    Input:
        result: a dictionary that will be used to map each pair to its frequency
        tweets: a list of comments
        ys: a list corresponding to the class of each comment (either 0 or 1)
    Output:
        result: a dictionary mapping each pair to its frequency
    '''
    ### START CODE HERE ###
    for y, comment in zip(ys, comments):
        comment_tokens = word_tokenize(process_content(comment))
        
        comment_stem = []

        for word in comment_tokens: # Go through every word in your tokens list
            if (word not in stopwords_english and  # remove stopwords
                word not in string.punctuation):  # remove punctuation
                
                stem_word = stemmer.stem(word)  # stemming word
                comment_stem.append(stem_word)  # append to the list
                
        for word in comment_stem:
            # define the key, which is the word and label tuple
            pair = (word, y)
            
            # if the key exists in the dictionary, increment the count
            if pair in result:
                result[pair] += 1

            # else, if the key is new, add it to the dictionary and set the count to 1
            else:
                result[pair] = 1
    ### END CODE HERE ###

    return result

In [69]:
# Build the freqs dictionary for later uses

freqs = count_tweets({}, x_train, y_train)

In [70]:
def train_naive_bayes(freqs, train_x, train_y):
    '''
    Input:
        freqs: dictionary from (word, label) to how often the word appears
        train_x: a list of tweets
        train_y: a list of labels correponding to the comments (0,1)
    Output:
        logprior: the log prior. (equation 3 above)
        loglikelihood: the log likelihood of you Naive bayes equation. (equation 6 above)
    '''
    loglikelihood = {}
    logprior = 0

    ### START CODE HERE ###

    # calculate V, the number of unique words in the vocabulary
    vocab = set([pair[0] for pair in freqs.keys()])
    V = len(vocab)

    # calculate N_pos, N_neg, V_pos, V_neg
    N_pos = N_neg = 0
    for pair in freqs.keys():
        # if the label is positive (greater than zero)
        if pair[1] > 0:

            # Increment the number of positive words by the count for this (word, label) pair
            N_pos += freqs.get(pair, 1)

        # else, the label is negative
        else:

            # increment the number of negative words by the count for this (word,label) pair
            N_neg += freqs.get(pair, 1)
    
    # Calculate D, the number of documents
    D = len(train_y)

    # Calculate D_pos, the number of positive documents
    D_pos = sum(train_y)

    # Calculate D_neg, the number of negative documents
    D_neg = D - D_pos

    # Calculate logprior
    logprior = np.log(D_pos) - np.log(D_neg)
    
    # For each word in the vocabulary...
    for word in vocab:
        # get the positive and negative frequency of the word
        freq_pos = freqs.get((word, 1.0), 0)
        freq_neg = freqs.get((word, 0.0), 0)

        # calculate the probability that each word is positive, and negative
        p_w_pos = (freq_pos + 1) / (N_pos + V)
        p_w_neg = (freq_neg + 1) / (N_neg + V)

        # calculate the log likelihood of the word
        loglikelihood[word] = np.log(p_w_pos) - np.log(p_w_neg)

    ### END CODE HERE ###

    return logprior, loglikelihood

In [71]:
logprior, loglikelihood = train_naive_bayes(freqs, x_train, y_train)

In [72]:
def naive_bayes_predict(comment, logprior, loglikelihood):
    '''
    Input:
        tweet: a string
        logprior: a number
        loglikelihood: a dictionary of words mapping to numbers
    Output:
        p: the sum of all the logliklihoods of each word in the comment (if found in the dictionary) + logprior (a number)

    '''
    ### START CODE HERE ###
    # process the tweet to get a list of words
    word_l = word_tokenize(process_content(comment))

    # initialize probability to zero
    p = 0

    # add the logprior
    p += logprior

    for word in word_l:

        # check if the word exists in the loglikelihood dictionary
        if word in loglikelihood:
            # add the log likelihood of that word to the probability
            p += loglikelihood[word]

    ### END CODE HERE ###

    return p

In [73]:
def test_naive_bayes(x_test, y_test, logprior, loglikelihood, naive_bayes_predict=naive_bayes_predict):
    """
    Input:
        test_x: A list of tweets
        test_y: the corresponding labels for the list of comments
        logprior: the logprior
        loglikelihood: a dictionary with the loglikelihoods for each word
    Output:
        accuracy: (# of comments classified correctly)/(total # of tweets)
    """
    accuracy = 0  # return this properly

    ### START CODE HERE ###
    y_hats = []
    for comment in x_test:
        # if the prediction is > 0
        if naive_bayes_predict(comment, logprior, loglikelihood) > 0:
            # the predicted class is 1
            y_hat_i = 1
        else:
            # otherwise the predicted class is 0
            y_hat_i = 0

        # append the predicted class to the list y_hats
        y_hats.append(y_hat_i)

    # error is the average of the absolute values of the differences between y_hats and test_y
    error = np.sum(np.abs(y_hats - y_test)) / len(y_test)

    # Accuracy is 1 minus the error
    accuracy = 1 - error

    ### END CODE HERE ###

    return accuracy

In [74]:
print(test_naive_bayes(x_test, y_test, logprior, loglikelihood))

0.8775510204081632


In [75]:
# Error analysis of the above models

print('=== Logistic Regression Model Error Analysis ===\n')
print('Truth Predicted Tweet')
for x, x_tfidf, y in zip(x_test, x_test_tfidf, y_test):
    y_hat = model_LR.predict(x_tfidf)
    if y != (np.sign(y_hat) > 0):
        print('%d\t%0.2f\t%s' % (y, np.sign(y_hat) > 0, ' '.join(
            word_tokenize(process_content(x))).encode('ascii', 'ignore')))
        
print('\n\n=== Random Forest Classifier Error Analysis ===\n')
print('Truth Predicted Tweet')
for x, x_tfidf, y in zip(x_test, x_test_tfidf, y_test):
    y_hat = model_RFC.predict(x_tfidf)
    if y != (np.sign(y_hat) > 0):
        print('%d\t%0.2f\t%s' % (y, np.sign(y_hat) > 0, ' '.join(
            word_tokenize(process_content(x))).encode('ascii', 'ignore')))
        
print('\n\n=== Multi-Layer Perceptron Model Error Analysis ===\n')
print('Truth Predicted Tweet')
for x, x_tfidf, y in zip(x_test, x_test_tfidf, y_test):
    y_hat = model_NN.predict(x_tfidf)
    if y != (np.sign(y_hat) > 0):
        print('%d\t%0.2f\t%s' % (y, np.sign(y_hat) > 0, ' '.join(
            word_tokenize(process_content(x))).encode('ascii', 'ignore')))

print('\n\n=== Naive Bayes Model Error Analysis ===\n')
print('Truth Predicted Tweet')
for x, y in zip(x_test, y_test):
    y_hat = naive_bayes_predict(x, logprior, loglikelihood)
    if y != (np.sign(y_hat) > 0):
        print('%d\t%0.2f\t%s' % (y, np.sign(y_hat) > 0, ' '.join(
            word_tokenize(process_content(x))).encode('ascii', 'ignore')))

=== Logistic Regression Model Error Analysis ===

Truth Predicted Tweet
1	0.00	b'follow follow vaahidmustafic like like'
1	0.00	b'o peoples of the earth i have seen how you perform every form of evil at your leisure you cease not from reveling in that which i hate behold you murder the innocent day and night and plot evil against your neighbor you stand up for the rights of those who commit abomination and clap your hands as wickedness is celebrated openly in the streets o most perverse and abominable generation shall i not repay hear the word of the lord trumpetcallofgodonline co m'
1	0.00	b'awesome share rteminem love the way you lie ft rihanna http ow ly zme f'
0	1.00	b'if you pause at at the last millisecond you can see that that chick is about to laugh takes a few tries'
0	1.00	b'if you are a person that loves real music you should listen to quot cruz supat quot he is awesome as fuck just as eminem used to be'
1	0.00	b'yea stil the best wk song ever thumbs up of you think the same

In [76]:
# Try stemming

from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english", ignore_stopwords=True)

class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: ([stemmer.stem(w) for w in analyzer(doc)])
    
stemmed_count_vect = StemmedCountVectorizer()

x_train_counts = stemmed_count_vect.fit_transform(x_train)
x_test_counts = stemmed_count_vect.transform(x_test)

tranformer = TfidfTransformer()
x_train_tfidf = tranformer.fit_transform(x_train_counts)
x_test_tfidf = tranformer.transform(x_test_counts)

df = pd.DataFrame(x_train_tfidf[0].T.todense(), index=stemmed_count_vect.get_feature_names(), columns=["TF-IDF"])
df = df.sort_values('TF-IDF', ascending=False)
print (df.head(25))

               TF-IDF
sister       0.616851
big          0.522706
me           0.299386
like         0.286187
you          0.244014
my           0.243089
and          0.236862
pared        0.000000
palastin     0.000000
pan          0.000000
pander26     0.000000
panorama     0.000000
pant         0.000000
paper        0.000000
paragraph    0.000000
paranorm     0.000000
00           0.000000
parodi       0.000000
parri        0.000000
paint        0.000000
part         0.000000
parti        0.000000
partyman318  0.000000
pass         0.000000
passion      0.000000




In [77]:
# Create and train Logistic Regression model
from sklearn.linear_model import LogisticRegression

model_LR = LogisticRegression()
model_LR.fit(x_train_tfidf, y_train)

accuracy = model_LR.score(x_test_tfidf, y_test)
print(accuracy)

0.9744897959183674


In [78]:
# Create and train Random Forest Classifier model
from sklearn.ensemble import RandomForestClassifier

model_RFC = RandomForestClassifier()
model_RFC.fit(x_train_tfidf,y_train)

accuracy = model_RFC.score(x_test_tfidf, y_test)
print(accuracy)

0.9719387755102041


In [79]:
# Create and train Multi-Layer Perceptron model
from sklearn.neural_network import MLPClassifier

model_NN = MLPClassifier(hidden_layer_sizes=(20,40,40,20), activation='relu', solver='adam', max_iter=10000)
model_NN.fit(x_train_tfidf, y_train)

accuracy = model_NN.score(x_test_tfidf, y_test)
print(accuracy)

0.9591836734693877


In [80]:
# Create and train XGBClassifier
from xgboost import XGBClassifier

model_XGB = XGBClassifier(objective = 'binary:logistic', max_depth = 4, alpha = 10, learning_rate = 1.0, n_estimators = 100)
model_XGB.fit(x_train_tfidf, y_train)

accuracy = model_XGB.score(x_test_tfidf, y_test)
print(accuracy)

0.9285714285714286




In [81]:
# Try lemmatization

import nltk
nltk.download('omw-1.4')
nltk.download('wordnet')
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\adity\AppData\Roaming\nltk_data...
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\adity\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [82]:
lemmatizer = WordNetLemmatizer()

class LemmaTokenizer(object):
    def __call__(self, text):
        return [lemmatizer.lemmatize(t) for t in word_tokenize(text)]
    
lemmed_count_vect = CountVectorizer(tokenizer=LemmaTokenizer())

x_train_counts = lemmed_count_vect.fit_transform(x_train)
x_test_counts = lemmed_count_vect.transform(x_test)

tranformer = TfidfTransformer()
x_train_tfidf = tranformer.fit_transform(x_train_counts)
x_test_tfidf = tranformer.transform(x_test_counts)

df = pd.DataFrame(x_train_tfidf[0].T.todense(), index=lemmed_count_vect.get_feature_names(), columns=["TF-IDF"])
df = df.sort_values('TF-IDF', ascending=False)
print (df.head(25))

               TF-IDF
sister       0.551494
you﻿         0.494830
big          0.467324
me           0.269722
like         0.259459
my           0.217884
and          0.212280
!            0.000000
pie          0.000000
pile         0.000000
pigment      0.000000
piece        0.000000
picked       0.000000
picture      0.000000
pimpmyviews  0.000000
pic          0.000000
piano        0.000000
photograph   0.000000
pilot        0.000000
piss         0.000000
pink         0.000000
phone﻿       0.000000
pitbull      0.000000
pivot        0.000000
place        0.000000




In [83]:
# Create and train Logistic Regression model
from sklearn.linear_model import LogisticRegression

model_LR = LogisticRegression()
model_LR.fit(x_train_tfidf, y_train)

accuracy = model_LR.score(x_test_tfidf, y_test)
print(accuracy)

0.9540816326530612


In [84]:
# Create and train Random Forest Classifier model
from sklearn.ensemble import RandomForestClassifier

model_RFC = RandomForestClassifier()
model_RFC.fit(x_train_tfidf,y_train)

accuracy = model_RFC.score(x_test_tfidf, y_test)
print(accuracy)

0.9617346938775511


In [85]:
# Create and train Multi-Layer Perceptron model
from sklearn.neural_network import MLPClassifier

model_NN = MLPClassifier(hidden_layer_sizes=(20,40,40,20), activation='relu', solver='adam', max_iter=10000)
model_NN.fit(x_train_tfidf, y_train)

accuracy = model_NN.score(x_test_tfidf, y_test)
print(accuracy)

0.9336734693877551


In [86]:
# Create and train XGBClassifier
from xgboost import XGBClassifier

model_XGB = XGBClassifier(objective = 'binary:logistic', max_depth = 4, alpha = 10, learning_rate = 1.0, n_estimators = 100)
model_XGB.fit(x_train_tfidf, y_train)

accuracy = model_XGB.score(x_test_tfidf, y_test)
print(accuracy)

0.9311224489795918


