In [None]:
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn import tree
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer 
import gensim
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [None]:
df = pd.read_csv("cyberbullying_tweets.csv")
Y = df.iloc[::,1].to_numpy()
types = {'age':0,
         'ethnicity':1,
         'gender':2,
         'not_cyberbullying':3,
         'other_cyberbullying':4,
         'religion':5}
Y = [types[y] for y in Y]
Y = np.reshape(Y, (len(Y),1))
X = df.iloc[::,0].to_numpy()
X = [''.join(item.lower() for item in x if item.isalpha() or item == " ") for x in X]
X = [x.split(" ") for x in X] 
#X = [item for sublist in X for item in sublist]

cbow = gensim.models.Word2Vec(
            X,
            size=50, # desired no. of features/independent variables
            window=8, # context window size
            min_count=2, # Ignores all words with total frequency lower than 2.                                  
            sg = 0, # 0 for cbow
            hs = 0, # to enable negative sampling
            negative = 10, # for negative sampling
            cbow_mean = 1, # use the mean instead of the sum
            workers= 32, # no.of cores
            seed = 34) 

cbow.train(X, total_examples= len(X), epochs=20)


(17152193, 22638920)

In [None]:
def word_vector(tokens, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0
    for word in tokens:
        try:
            vec += cbow.wv[word].reshape((1, size))
            count += 1.
        except KeyError:  # handling the case where the token is not in vocabulary
            continue
    if count != 0:
        vec /= count
    return vec
wordvec_arrays = np.zeros((len(X), 50)) 
for i in range(len(X)):
    wordvec_arrays[i,:] = word_vector(X[i], 50)
wordvec_df = pd.DataFrame(wordvec_arrays)
wordvec_df.shape

(47692, 50)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(wordvec_df, Y, test_size=0.30, random_state=0)
X_train = X_train.values.tolist()
X_test = X_test.values.tolist()

clf = tree.DecisionTreeClassifier(max_depth = 10, min_samples_split = 6, min_samples_leaf= 1, random_state=42)
clf.fit(X_train, y_train)
print("sk-learn model score: ", clf.score(X_test, y_test))


sk-learn model score:  0.6437657254682695


In [None]:
cbow_dt = Pipeline([
                    ('dt', tree.DecisionTreeClassifier())])


parameters = {
              'dt__max_depth': [5, 10, 15],
              'dt__min_samples_split': [3, 6, 10],
              'dt__min_samples_leaf': [1, 4, 8]}

grid_DT = GridSearchCV(cbow_dt, parameters, verbose=0, return_train_score=True)
grid_DT = grid_DT.fit(X_train, y_train)


print("Best DT model score: ", grid_DT.score(X_test, y_test))

Best DT model score:  0.6456527816606095


In [None]:
grid_DT.best_estimator_.get_params()

{'dt': DecisionTreeClassifier(max_depth=10, min_samples_split=10),
 'dt__ccp_alpha': 0.0,
 'dt__class_weight': None,
 'dt__criterion': 'gini',
 'dt__max_depth': 10,
 'dt__max_features': None,
 'dt__max_leaf_nodes': None,
 'dt__min_impurity_decrease': 0.0,
 'dt__min_samples_leaf': 1,
 'dt__min_samples_split': 10,
 'dt__min_weight_fraction_leaf': 0.0,
 'dt__random_state': None,
 'dt__splitter': 'best',
 'memory': None,
 'steps': [('dt', DecisionTreeClassifier(max_depth=10, min_samples_split=10))],
 'verbose': False}

In [None]:
results_df = pd.DataFrame(grid_DT.cv_results_)
results_df = results_df.sort_values(by=["rank_test_score"])
results_df = results_df.set_index(
    results_df["params"].apply(lambda x: "_".join(str(val) for val in x.values()))
).rename_axis("kernel")
results_df[["params", "rank_test_score", "mean_test_score", "std_test_score"]]

Unnamed: 0_level_0,params,rank_test_score,mean_test_score,std_test_score
kernel,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10_1_10,"{'dt__max_depth': 10, 'dt__min_samples_leaf': ...",1,0.649053,0.003798
10_1_3,"{'dt__max_depth': 10, 'dt__min_samples_leaf': ...",2,0.648754,0.00404
10_4_6,"{'dt__max_depth': 10, 'dt__min_samples_leaf': ...",3,0.648664,0.004225
10_1_6,"{'dt__max_depth': 10, 'dt__min_samples_leaf': ...",4,0.648634,0.00352
10_4_10,"{'dt__max_depth': 10, 'dt__min_samples_leaf': ...",5,0.648544,0.004032
10_4_3,"{'dt__max_depth': 10, 'dt__min_samples_leaf': ...",6,0.648514,0.003931
10_8_10,"{'dt__max_depth': 10, 'dt__min_samples_leaf': ...",7,0.647316,0.003413
10_8_3,"{'dt__max_depth': 10, 'dt__min_samples_leaf': ...",8,0.647256,0.003349
10_8_6,"{'dt__max_depth': 10, 'dt__min_samples_leaf': ...",9,0.647166,0.003641
15_8_3,"{'dt__max_depth': 15, 'dt__min_samples_leaf': ...",10,0.642913,0.003011


In [None]:


clf = RandomForestClassifier()



In [None]:
RF = Pipeline([('clf', RandomForestClassifier())])


parameters = {'clf__n_estimators': [5, 10, 15, 20]}

grid_RF = GridSearchCV(RF, parameters, verbose=0, return_train_score=True)
grid_RF = grid_RF.fit(X_train, y_train)

print("\n\n")
print("Best RF model score: ", grid_RF.score(X_test, y_test))
print("\n\n")
grid_RF.best_estimator_.get_params()

  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_ste

Best RF model score:  0.7323874755381604





{'clf': RandomForestClassifier(n_estimators=20),
 'clf__bootstrap': True,
 'clf__ccp_alpha': 0.0,
 'clf__class_weight': None,
 'clf__criterion': 'gini',
 'clf__max_depth': None,
 'clf__max_features': 'auto',
 'clf__max_leaf_nodes': None,
 'clf__max_samples': None,
 'clf__min_impurity_decrease': 0.0,
 'clf__min_samples_leaf': 1,
 'clf__min_samples_split': 2,
 'clf__min_weight_fraction_leaf': 0.0,
 'clf__n_estimators': 20,
 'clf__n_jobs': None,
 'clf__oob_score': False,
 'clf__random_state': None,
 'clf__verbose': 0,
 'clf__warm_start': False,
 'memory': None,
 'steps': [('clf', RandomForestClassifier(n_estimators=20))],
 'verbose': False}

In [None]:
from sklearn.ensemble import BaggingClassifier


bag = BaggingClassifier(tree.DecisionTreeClassifier()) #, max_samples=0.5, n_estimators=100)



BAG = Pipeline([
               ('bag', bag)])


parameters = {'bag__n_estimators': [5, 10, 15, 20]}

grid_BAG = GridSearchCV(BAG, parameters, verbose=0, return_train_score=True)
grid_BAG = grid_BAG.fit(X_train, y_train)

print("\n\n")
print("Best BAGGING model score: ", grid_BAG.score(X_test, y_test))
print("\n\n")
grid_BAG.best_estimator_.get_params()

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)





Best BAGGING model score:  0.723930668157674





{'bag': BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=20),
 'bag__base_estimator': DecisionTreeClassifier(),
 'bag__base_estimator__ccp_alpha': 0.0,
 'bag__base_estimator__class_weight': None,
 'bag__base_estimator__criterion': 'gini',
 'bag__base_estimator__max_depth': None,
 'bag__base_estimator__max_features': None,
 'bag__base_estimator__max_leaf_nodes': None,
 'bag__base_estimator__min_impurity_decrease': 0.0,
 'bag__base_estimator__min_samples_leaf': 1,
 'bag__base_estimator__min_samples_split': 2,
 'bag__base_estimator__min_weight_fraction_leaf': 0.0,
 'bag__base_estimator__random_state': None,
 'bag__base_estimator__splitter': 'best',
 'bag__bootstrap': True,
 'bag__bootstrap_features': False,
 'bag__max_features': 1.0,
 'bag__max_samples': 1.0,
 'bag__n_estimators': 20,
 'bag__n_jobs': None,
 'bag__oob_score': False,
 'bag__random_state': None,
 'bag__verbose': 0,
 'bag__warm_start': False,
 'memory': None,
 'steps': [('bag',
   BaggingClassifier(base

In [None]:
from sklearn.ensemble import AdaBoostClassifier



ada = AdaBoostClassifier(tree.DecisionTreeClassifier()) #, max_samples=0.5, n_estimators=100)

#ada = AdaBoostClassifier(clf, n_estimators=10, learning_rate=0.1)

ADA = Pipeline([
               ('ada', ada)])


parameters = {'ada__n_estimators': [5, 10, 15]}

grid_ADA = GridSearchCV(ADA, parameters, verbose=0, return_train_score=True)
grid_ADA = grid_ADA.fit(X_train, y_train)

print("\n\n")
print("Best AdaBoost model score: ", grid_ADA.score(X_test, y_test))
print("\n\n")
grid_ADA.best_estimator_.get_params()

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)





Best AdaBoost model score:  0.709742801230081





{'ada': AdaBoostClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=15),
 'ada__algorithm': 'SAMME.R',
 'ada__base_estimator': DecisionTreeClassifier(),
 'ada__base_estimator__ccp_alpha': 0.0,
 'ada__base_estimator__class_weight': None,
 'ada__base_estimator__criterion': 'gini',
 'ada__base_estimator__max_depth': None,
 'ada__base_estimator__max_features': None,
 'ada__base_estimator__max_leaf_nodes': None,
 'ada__base_estimator__min_impurity_decrease': 0.0,
 'ada__base_estimator__min_samples_leaf': 1,
 'ada__base_estimator__min_samples_split': 2,
 'ada__base_estimator__min_weight_fraction_leaf': 0.0,
 'ada__base_estimator__random_state': None,
 'ada__base_estimator__splitter': 'best',
 'ada__learning_rate': 1.0,
 'ada__n_estimators': 15,
 'ada__random_state': None,
 'memory': None,
 'steps': [('ada',
   AdaBoostClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=15))],
 'verbose': False}

In [None]:
from sklearn.ensemble import GradientBoostingClassifier


GBC = Pipeline([
               ('gbc', GradientBoostingClassifier())])


parameters = {'gbc__n_estimators': [5, 10, 15]}

grid_GBC = GridSearchCV(GBC, parameters, verbose=0, return_train_score=True)
grid_GBC = grid_GBC.fit(X_train, y_train)

print("\n\n")
print("Best AdaBoost model score: ", grid_GBC.score(X_test, y_test))
print("\n\n")
grid_GBC.best_estimator_.get_params()

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)





Best AdaBoost model score:  0.7002376292982947





{'gbc': GradientBoostingClassifier(n_estimators=15),
 'gbc__ccp_alpha': 0.0,
 'gbc__criterion': 'friedman_mse',
 'gbc__init': None,
 'gbc__learning_rate': 0.1,
 'gbc__loss': 'deviance',
 'gbc__max_depth': 3,
 'gbc__max_features': None,
 'gbc__max_leaf_nodes': None,
 'gbc__min_impurity_decrease': 0.0,
 'gbc__min_samples_leaf': 1,
 'gbc__min_samples_split': 2,
 'gbc__min_weight_fraction_leaf': 0.0,
 'gbc__n_estimators': 15,
 'gbc__n_iter_no_change': None,
 'gbc__random_state': None,
 'gbc__subsample': 1.0,
 'gbc__tol': 0.0001,
 'gbc__validation_fraction': 0.1,
 'gbc__verbose': 0,
 'gbc__warm_start': False,
 'memory': None,
 'steps': [('gbc', GradientBoostingClassifier(n_estimators=15))],
 'verbose': False}