In [13]:
import pandas as pd
import re
import numpy as np

import nltk
from nltk.corpus import stopwords, wordnet 
from nltk.tokenize import word_tokenize 
from nltk.stem import SnowballStemmer, WordNetLemmatizer

import matplotlib.pyplot as plt

In [7]:
nltk.download('omw-1.4')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package omw-1.4 to /Users/ruyroa/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /Users/ruyroa/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/ruyroa/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /Users/ruyroa/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [96]:
#for model-building
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

# bag of words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [97]:
#for word embedding
import gensim
from gensim.models import Word2Vec

In [108]:
pd.set_option('display.max_columns', None)

### Data preprocessing

In [12]:
df = pd.read_csv("cyberbullying_tweets.csv")

In [14]:
# Function to remove non letters

# Regular expression for removing all non-letter characters in the file.
regex = re.compile('[^a-zA-Z ]')

def remove_non_letters(word):
    return regex.sub("", word)

In [15]:
def remove_stopwords(word, stopwords):
    a = [x for x in word.split(' ') if x not in stopwords]
    return ' '.join(a)

In [18]:
# Lemmatization
# This is a helper function to map NTLK position tags
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

    # Tokenize the sentence
def lemmatizer(string, word_lemmatizer):
    word_pos_tags = nltk.pos_tag(word_tokenize(string)) # Get position tags
    a=[word_lemmatizer.lemmatize(tag[0], get_wordnet_pos(tag[1])) for idx, tag in enumerate(word_pos_tags)] # Map the position tag and lemmatize the word/token
    return ' '.join(a)

In [19]:
# Function to remove the stopwords of an only letters string
def preprocess_string(string, stopwords, word_lemmatizer):
    return lemmatizer(remove_stopwords(remove_non_letters(string).lower(),
                                       stopwords), word_lemmatizer)

In [20]:
stopwords_en = stopwords.words('english')

WNL = WordNetLemmatizer()

In [21]:
df['clean_tweet_text'] = df.tweet_text.apply(lambda x : preprocess_string(x, stopwords_en, WNL))

In [22]:
df["clean_tok_tweet"] = df.clean_tweet_text.apply(word_tokenize)

In [23]:
df["len_tweet"] = df.tweet_text.apply(len)

In [24]:
df.groupby(by = ["cyberbullying_type"]).agg({"len_tweet":[np.mean, np.std]})

Unnamed: 0_level_0,len_tweet,len_tweet
Unnamed: 0_level_1,mean,std
cyberbullying_type,Unnamed: 1_level_2,Unnamed: 2_level_2
age,173.542042,80.052851
ethnicity,139.32006,76.774127
gender,136.4223,71.352681
not_cyberbullying,83.107363,45.510016
other_cyberbullying,85.713281,91.682049
religion,197.999,71.941532


### Using Bag of Words vectorizer

In [145]:
X_train, X_test, y_train, y_test = train_test_split(df["clean_tweet_text"],
                                                    df["cyberbullying_type"],
                                                    test_size=0.3,
                                                    random_state = 0,
                                                    shuffle=True)

In [146]:
# Bag of words vectorizer
tfidf_vectorizer = TfidfVectorizer(use_idf=True)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train) 
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [159]:
help(DecisionTreeClassifier)

Help on class DecisionTreeClassifier in module sklearn.tree._classes:

class DecisionTreeClassifier(sklearn.base.ClassifierMixin, BaseDecisionTree)
 |  DecisionTreeClassifier(*, criterion='gini', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0, class_weight=None, ccp_alpha=0.0)
 |  
 |  A decision tree classifier.
 |  
 |  Read more in the :ref:`User Guide <tree>`.
 |  
 |  Parameters
 |  ----------
 |  criterion : {"gini", "entropy"}, default="gini"
 |      The function to measure the quality of a split. Supported criteria are
 |      "gini" for the Gini impurity and "entropy" for the information gain.
 |  
 |  splitter : {"best", "random"}, default="best"
 |      The strategy used to choose the split at each node. Supported
 |      strategies are "best" to choose the best split and "random" to choose
 |      the best random split.
 |  
 |  max_d

In [157]:
# Basic Decision Tree Classifier
clf = DecisionTreeClassifier().fit(X_train_tfidf, y_train)
y_pred = clf.predict(X_test_tfidf)

In [158]:
print(accuracy_score(y_pred, y_test))
print("")
print(confusion_matrix(y_test, y_pred))

0.7937517472742521

[[2292    5    7   53   21    4]
 [   6 2346    9   23   30   10]
 [   5   12 1988  185  214    6]
 [  36   11  139 1220  901   86]
 [  22   13  138  825 1274   15]
 [   1   10   18  105   41 2237]]


In [162]:
clf.get_depth()

1457

In [188]:
pgrid = {"max_depth": [10, 20, 75, 100],
      "min_samples_split": [5, 10, 15, 25]}

grid_search = GridSearchCV(DecisionTreeClassifier(random_state = 0), param_grid=pgrid, cv=5)
grid_search.fit(X_train_tfidf, y_train)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=0),
             param_grid={'max_depth': [10, 20, 75, 100],
                         'min_samples_split': [5, 10, 15, 25]})

In [189]:
print(grid_search.best_params_)
y_pred = grid_search.best_estimator_.predict(X_test_tfidf)
print(accuracy_score(y_pred, y_test))
print("")
print(confusion_matrix(y_test, y_pred))

{'max_depth': 75, 'min_samples_split': 15}
0.8122728543472183

[[2300    3    4   47   27    1]
 [   6 2348   11   10   42    7]
 [   5   14 1946  129  312    4]
 [  41   14  117  884 1274   63]
 [  20   11   96  235 1919    6]
 [   2   10   13   76   86 2225]]


In [172]:
from sklearn.ensemble import BaggingClassifier

In [186]:
clf_bagg = BaggingClassifier(DecisionTreeClassifier(random_state = 0, max_depth = 75,
                                                        min_samples_split = 15), 
                             max_samples=0.5, max_features=0.5, n_estimators=50)
clf_bagg.fit(X_train_tfidf, y_train)

y_pred_bagg = clf_bagg.predict(X_test_tfidf)

In [187]:
print(accuracy_score(y_pred_bagg, y_test))
print("")
print(confusion_matrix(y_test, y_pred_bagg))

0.8279284316466312

[[2332    2    0   30   17    1]
 [   4 2376    4    8   28    4]
 [   4   11 1965  164  261    5]
 [  46   17   68 1050 1128   84]
 [  26   14   76  319 1825   27]
 [   0    8   11   36   59 2298]]


In [199]:
pgrid = {"max_samples": [0.4, 0.5, 0.6],
      "max_features": [0.4, 0.5, 0.6]}
grid_search_bagg = GridSearchCV(BaggingClassifier(DecisionTreeClassifier(random_state = 0, 
                                                                         max_depth = 75,
                                                                         min_samples_split = 15),
                                                 n_estimators = 50), 
                                param_grid=pgrid, cv=5)

grid_search_bagg.fit(X_train_tfidf, y_train)

GridSearchCV(cv=5,
             estimator=BaggingClassifier(base_estimator=DecisionTreeClassifier(max_depth=75,
                                                                               min_samples_split=15,
                                                                               random_state=0),
                                         n_estimators=50),
             param_grid={'max_features': [0.4, 0.5, 0.6],
                         'max_samples': [0.4, 0.5, 0.6]})

In [197]:
print(grid_search_bagg.best_params_)
y_pred_bagg_gs = grid_search_bagg.best_estimator_.predict(X_test_tfidf)
print(accuracy_score(y_pred_bagg_gs, y_test))
print("")
print(confusion_matrix(y_test, y_pred_bagg_gs))

{'max_features': 0.5, 'max_samples': 0.5}
0.8270897400055913

[[2326    0    2   36   18    0]
 [   4 2369    7    9   31    4]
 [   6    9 1944  174  270    7]
 [  39   16   60 1060 1132   86]
 [  19   10   70  329 1839   20]
 [   0    4   12   33   67 2296]]


In [256]:
print(classification_report(y_test, y_pred_bagg_gs))

                     precision    recall  f1-score   support

                age       0.97      0.98      0.97      2382
          ethnicity       0.98      0.98      0.98      2424
             gender       0.93      0.81      0.86      2410
  not_cyberbullying       0.65      0.44      0.53      2393
other_cyberbullying       0.55      0.80      0.65      2287
           religion       0.95      0.95      0.95      2412

           accuracy                           0.83     14308
          macro avg       0.84      0.83      0.82     14308
       weighted avg       0.84      0.83      0.83     14308



In [313]:
rf_bw = RandomForestClassifier(n_estimators = 200, random_state = 0)
rf_bw.fit(X_train_tfidf, y_train)
y_pred = rf_bw.predict(X_test_tfidf)

In [314]:
print(accuracy_score(y_pred, y_test))

0.821917808219178


In [316]:
from sklearn.ensemble import AdaBoostClassifier

In [315]:
print(classification_report(y_test, y_pred))

                     precision    recall  f1-score   support

                age       0.97      0.98      0.97      2382
          ethnicity       0.99      0.98      0.99      2424
             gender       0.91      0.83      0.87      2410
  not_cyberbullying       0.59      0.49      0.53      2393
other_cyberbullying       0.54      0.68      0.60      2287
           religion       0.95      0.96      0.96      2412

           accuracy                           0.82     14308
          macro avg       0.83      0.82      0.82     14308
       weighted avg       0.83      0.82      0.82     14308



In [326]:
ada = AdaBoostClassifier(base_estimator = DecisionTreeClassifier(random_state = 0, 
                                                                max_depth = 5,
                                                                min_samples_split = 2), 
                         n_estimators = 150, 
                         learning_rate = 0.1).fit(X_train_tfidf, y_train)

In [328]:
y_pred = ada.predict(X_test_tfidf)
print(classification_report(y_test, y_pred))
print(accuracy_score(y_pred, y_test))

                     precision    recall  f1-score   support

                age       0.99      0.94      0.96      2382
          ethnicity       0.99      0.96      0.98      2424
             gender       0.96      0.66      0.78      2410
  not_cyberbullying       0.44      0.38      0.41      2393
other_cyberbullying       0.47      0.85      0.61      2287
           religion       0.98      0.73      0.84      2412

           accuracy                           0.75     14308
          macro avg       0.80      0.76      0.76     14308
       weighted avg       0.81      0.75      0.76     14308

0.7546128040257198


In [329]:
from sklearn.ensemble import GradientBoostingClassifier

In [335]:
gbc = GradientBoostingClassifier(n_estimators=150, learning_rate=0.1) 
gbc.fit(X_train_tfidf, y_train) 

GradientBoostingClassifier(n_estimators=150)

In [337]:
y_pred = gbc.predict(X_test_tfidf)
print(classification_report(y_test, y_pred))
print(accuracy_score(y_pred, y_test))

                     precision    recall  f1-score   support

                age       0.98      0.97      0.97      2382
          ethnicity       0.99      0.97      0.98      2424
             gender       0.92      0.83      0.87      2410
  not_cyberbullying       0.70      0.42      0.53      2393
other_cyberbullying       0.55      0.87      0.67      2287
           religion       0.97      0.92      0.94      2412

           accuracy                           0.83     14308
          macro avg       0.85      0.83      0.83     14308
       weighted avg       0.85      0.83      0.83     14308

0.8313530891808778


In [142]:
df.cyberbullying_type.describe()

count        47692
unique           6
top       religion
freq          7998
Name: cyberbullying_type, dtype: object

### Using Word2Vec for the embedding

In [225]:
X_train_tok, X_test_tok, y_train_w2v, y_test_w2v = train_test_split(df["clean_tok_tweet"],
                                                    df["cyberbullying_type"],
                                                    test_size=0.3,
                                                    random_state = 0,
                                                    shuffle=True)

In [200]:
#building Word2Vec model
class MeanEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        # if a text is empty we should return a vector of zeros
        # with the same dimensionality as all the other vectors
        self.dim = len(next(iter(word2vec.values())))
    def fit(self, X, y):
            return self
    def transform(self, X):
            return np.array([
                np.mean([self.word2vec[w] for w in words if w in self.word2vec]
                        or [np.zeros(self.dim)], axis=0)
                for words in X])

In [254]:
from sklearn.metrics import classification_report

In [252]:
help(Word2Vec)

Help on class Word2Vec in module gensim.models.word2vec:

class Word2Vec(gensim.utils.SaveLoad)
 |  Word2Vec(sentences=None, corpus_file=None, vector_size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=0.001, seed=1, workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=<built-in function hash>, epochs=5, null_word=0, trim_rule=None, sorted_vocab=1, batch_words=10000, compute_loss=False, callbacks=(), comment=None, max_final_vocab=None)
 |  
 |  Method resolution order:
 |      Word2Vec
 |      gensim.utils.SaveLoad
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __init__(self, sentences=None, corpus_file=None, vector_size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=0.001, seed=1, workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=<built-in function hash>, epochs=5, null_word=0, trim_rule=None, sorted_vocab=1, batch_words=10000, compute_loss=Fals

In [345]:
model = Word2Vec(df['clean_tok_tweet'],min_count=3, vector_size = 500, window = 3)
w2v = dict(zip(model.wv.index_to_key, model.wv.vectors)) 
modelw = MeanEmbeddingVectorizer(w2v)

In [284]:
X_train_w2v = modelw.transform(X_train_tok)
X_test_w2v = modelw.transform(X_test_tok)

In [295]:
X_train_w2v.shape

(33384, 1000)

In [299]:
clf = DecisionTreeClassifier(max_depth = 15, 
                             min_samples_split = 2,
                            min_samples_leaf = 2).fit(X_train_w2v, y_train_w2v)

In [297]:
y_pred_w2v = clf.predict(X_test_w2v)

print(classification_report(y_test_w2v, y_pred_w2v))

In [293]:
print(accuracy_score(y_pred_w2v, y_test_w2v))

0.7113502935420744


In [262]:
print(grid_search_w2v.best_params_)
print("")
y_pred_w2v = grid_search_w2v.best_estimator_.predict(X_test_w2v)
print("")
print(classification_report(y_test_w2v, y_pred_w2v))
print("")
print(accuracy_score(y_test_w2v, y_pred_w2v))
print("")
print(confusion_matrix(y_test_w2v, y_pred_w2v))

{'max_depth': 15, 'min_samples_leaf': 7, 'min_samples_split': 10}


                     precision    recall  f1-score   support

                age       0.87      0.90      0.88      2382
          ethnicity       0.83      0.82      0.82      2424
             gender       0.78      0.69      0.73      2410
  not_cyberbullying       0.47      0.41      0.44      2393
other_cyberbullying       0.42      0.52      0.46      2287
           religion       0.77      0.76      0.77      2412

           accuracy                           0.69     14308
          macro avg       0.69      0.68      0.69     14308
       weighted avg       0.69      0.69      0.69     14308


0.6854207436399217

[[2138   28   14   72   93   37]
 [  40 1984   68   71  176   85]
 [  47   68 1653  234  314   94]
 [  97   77  171  992  877  179]
 [  81  129  152  580 1198  147]
 [  48  111   51  144  216 1842]]


In [346]:
gbc_w2v = GradientBoostingClassifier(n_estimators=20, learning_rate=0.5) 
gbc_w2v.fit(X_train_w2v, y_train_w2v) 

GradientBoostingClassifier(learning_rate=0.5, n_estimators=20)

In [347]:
y_pred_w2v = gbc_w2v.predict(X_test_w2v)

print(classification_report(y_test_w2v, y_pred_w2v))

                     precision    recall  f1-score   support

                age       0.94      0.96      0.95      2382
          ethnicity       0.90      0.90      0.90      2424
             gender       0.86      0.72      0.79      2410
  not_cyberbullying       0.56      0.48      0.52      2393
other_cyberbullying       0.49      0.61      0.54      2287
           religion       0.86      0.89      0.88      2412

           accuracy                           0.76     14308
          macro avg       0.77      0.76      0.76     14308
       weighted avg       0.77      0.76      0.76     14308



In [36]:
words_registered = []
words_count = []
words_index = dict()
i = 0
for line in df.iloc[:,0]:
    word_list = preprocess_string(line, stopwords_en)
    for word in word_list:
        if word not in words_registered:
            words_registered.append(word)
            words_count.append([word, 1])
            words_index[word] = i
            i += 1
        else:
            words_count[words_index[word]][1] += 1

In [43]:
sorted(words_count, key = lambda x : x[1], reverse = True)

[['', 17222],
 ['school', 8725],
 ['like', 5866],
 ['fuck', 5799],
 ['dumb', 5336],
 ['high', 5098],
 ['people', 4807],
 ['bullied', 4666],
 ['im', 4488],
 ['dont', 4407],
 ['nigger', 4318],
 ['rape', 4247],
 ['rt', 4230],
 ['u', 4106],
 ['gay', 4008],
 ['bully', 3613],
 ['jokes', 3470],
 ['one', 3402],
 ['get', 2846],
 ['girls', 2834],
 ['ass', 2794],
 ['girl', 2710],
 ['black', 2701],
 ['mkr', 2659],
 ['amp', 2608],
 ['muslims', 2424],
 ['muslim', 2345],
 ['know', 2340],
 ['white', 2191],
 ['think', 2129],
 ['would', 2054],
 ['joke', 2041],
 ['bitch', 2026],
 ['idiot', 1993],
 ['right', 1838],
 ['say', 1743],
 ['got', 1731],
 ['fucking', 1679],
 ['call', 1676],
 ['even', 1657],
 ['bullies', 1652],
 ['go', 1640],
 ['women', 1623],
 ['cant', 1616],
 ['make', 1612],
 ['youre', 1567],
 ['christian', 1536],
 ['never', 1528],
 ['see', 1525],
 ['ur', 1504],
 ['called', 1499],
 ['shit', 1496],
 ['time', 1483],
 ['idiots', 1470],
 ['woman', 1460],
 ['still', 1449],
 ['really', 1442],
 ['thats

### Using a pretrained GloVe embedding vector

In [25]:
embedd_len = 50

In [28]:
import gensim.downloader

In [29]:
glove_vectors = gensim.downloader.load(f'glove-twitter-{embedd_len}')



In [31]:
glove_vectors.add_vector(key = "not in voc", vector = np.zeros(embedd_len))
max_length = df.clean_tok_tweet.apply(len).max()

AttributeError: 'DataFrame' object has no attribute 'clean_tok'

In [107]:
df.clean_tok_tweet[df.clean_tok_tweet.apply(len) == max_length].iloc[0]

['feminazi',
 'actual',
 'word',
 'denotnasharchy',
 'job',
 'mean',
 'protect',
 'people',
 'even',
 'people',
 'dont',
 'agree',
 'withlikethey',
 'barely',
 'cook',
 'anything',
 'entre',
 'mkrrt',
 'kf',
 'like',
 'community',
 'foster',
 'jerk',
 'prevent',
 'user',
 'adopt',
 'communitys',
 'code',
 'much',
 'technical',
 'problemits',
 'almost',
 'time',
 'jamesgweenwood',
 'shock',
 'youre',
 'bully',
 'againlionlioneateat',
 'yeah',
 'someone',
 'dmed',
 'screenshot',
 'meh',
 'let',
 'idea',
 'even',
 'isfor',
 'egregious',
 'case',
 'harassment',
 'definitely',
 'need',
 'able',
 'respond',
 'appropriately',
 'toxicity',
 'isnt',
 'thisso',
 'happen',
 'httptcozptrtsyfivnibelsnarfabarf',
 'srhbutts',
 'grimachu',
 'really',
 'funny',
 'assumption',
 'make',
 'work',
 'much',
 'wrongthe',
 'lack',
 'selfawareness',
 'wadhwa',
 'right',
 'stagger',
 'hilarious',
 'stopwadhwahahahaha',
 'httptcojrpkjcnvhthis',
 'go',
 'well',
 'finale',
 'meet',
 'mother',
 'one',
 'way',
 'ano

In [120]:
df[df.clean_tok_tweet.apply(len) > 40].cyberbullying_type.count()

26

In [101]:
def pad_sentence(sentence, length, pad = 'not in voc'):
    diff_lengths = length - len(sentence)
    if diff_lengths > 0:
        return sentence + [pad for i in range(diff_lengths)]
    elif diff_lengths == 0:
        return sentence
    else:
        raise Exception("Not appropriate max final length")

In [34]:
def create_vocabulary(sentences):
    words_registered = []
    words_index = []
    i = 0
    for sentence in sentences:
        for w in sentence:
            if w not in words_registered:
                words_registered.append(w)
                words_index.append(i)
                i+=1
            else:
                continue
    return dict(zip(words_registered, words_index))

vocabulary = create_vocabulary(df.clean_tok_tweet)
vocabulary["not in voc"] = len(vocabulary)
vocab_size = len(vocabulary)

In [36]:
embed_matrix = np.zeros((vocab_size, embedd_len))
for key in vocabulary.keys():
    try:
        embed_matrix[vocabulary[key]] = glove_vectors[key]
    except:
        continue

In [73]:
def MeanEmbedSentence(sentence, KeyedVector, d_embed = 50):
    if len(sentence) > 0:
        vectors = []
        for word in sentence:
            try:
                vectors.append(KeyedVector[word])
            except:
                vectors.append(np.zeros(d_embed))
        return np.array(vectors).mean(axis = 0)
    else:
        return np.zeros(d_embed)

In [89]:
x_gv = df.clean_tok_tweet.apply(lambda x : MeanEmbedSentence(x, glove_vectors, embedd_len))

In [90]:
X_gv = np.array([x for x in x_gv])

In [93]:
X_train_gv, X_test_gv, y_train_gv, y_test_gv = train_test_split(X_gv,
                                                    df["cyberbullying_type"],
                                                    test_size=0.3,
                                                    random_state = 0,
                                                    shuffle=True)

array([[ 0.61393234,  0.09909167, -0.27856333, ...,  0.34815332,
         0.0096385 ,  0.38089   ],
       [ 0.12575366,  0.23154466, -0.02757628, ...,  0.17319603,
        -0.12567182,  0.05321825],
       [ 0.74423122,  0.36952874, -0.19222251, ...,  0.43436208,
         0.08791625,  0.3450079 ],
       ...,
       [ 0.44971201,  0.38231599, -0.1024151 , ...,  0.39961329,
         0.1039636 ,  0.101904  ],
       [ 0.27169004,  0.11556547, -0.08892915, ...,  0.12240506,
        -0.00957426,  0.07988633],
       [ 0.08403   ,  0.52644998, -0.21923   , ..., -0.142985  ,
         0.153145  ,  0.08888   ]])

In [99]:
rf_gv = RandomForestClassifier(n_estimators = 200, random_state = 0)
rf_gv.fit(X_train_gv, y_train_gv)
y_pred_gv = rf_gv.predict(X_test_gv)

In [100]:
print(accuracy_score(y_pred_gv, y_test_gv))

0.7130975677942409
