In [2]:
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn import tree
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer 
import gensim
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [3]:
df = pd.read_csv("cyberbullying_tweets.csv")
Y = df.iloc[::,1].to_numpy()
types = {'age':0,
         'ethnicity':1,
         'gender':2,
         'not_cyberbullying':3,
         'other_cyberbullying':4,
         'religion':5}
Y = [types[y] for y in Y]
Y = np.reshape(Y, (len(Y),1))
X = df.iloc[::,0].to_numpy()
X = [''.join(item.lower() for item in x if item.isalpha() or item == " ") for x in X]
X = [x.split(" ") for x in X] 
#X = [item for sublist in X for item in sublist]

cbow = gensim.models.Word2Vec(
            X,
            size=50, # desired no. of features/independent variables
            window=8, # context window size
            min_count=2, # Ignores all words with total frequency lower than 2.                                  
            sg = 0, # 0 for cbow
            hs = 0, # to enable negative sampling
            negative = 10, # for negative sampling
            cbow_mean = 1, # use the mean instead of the sum
            workers= 32, # no.of cores
            seed = 34) 

cbow.train(X, total_examples= len(X), epochs=20)


(17152847, 22638920)

In [4]:
def word_vector(tokens, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0
    for word in tokens:
        try:
            vec += cbow.wv[word].reshape((1, size))
            count += 1.
        except KeyError:  # handling the case where the token is not in vocabulary
            continue
    if count != 0:
        vec /= count
    return vec
wordvec_arrays = np.zeros((len(X), 50)) 
for i in range(len(X)):
    wordvec_arrays[i,:] = word_vector(X[i], 50)
wordvec_df = pd.DataFrame(wordvec_arrays)
wordvec_df.shape

(47692, 50)

In [5]:
X_train, X_test, y_train, y_test = train_test_split(wordvec_df, Y, test_size=0.30, random_state=0)
X_train = X_train.values.tolist()
X_test = X_test.values.tolist()

clf = tree.DecisionTreeClassifier(max_depth = 10, min_samples_split = 6, min_samples_leaf= 1, random_state=42)
clf.fit(X_train, y_train)
print("sk-learn model score: ", clf.score(X_test, y_test))


sk-learn model score:  0.646561364271736


In [6]:
def display_top_ten(results):

  results_df = pd.DataFrame(results)
  results_df = results_df.sort_values(by=["rank_test_score"])
  results_df = results_df.set_index(
      results_df["params"].apply(lambda x: "_".join(str(val) for val in x.values()))
      ).rename_axis("kernel")
  top = results_df[[ "rank_test_score", "mean_test_score", "std_test_score"]]

  print(top[:10])



In [7]:
dt = tree.DecisionTreeClassifier(random_state=42)
DT = Pipeline([('dt', dt)])


parameters = {
              'dt__max_depth': [5, 10, 15],
              'dt__min_samples_split': [3, 6, 10],
              'dt__min_samples_leaf': [1, 4, 8]
              }

grid_DT = GridSearchCV(DT, parameters, verbose=0, return_train_score=True)
grid_DT = grid_DT.fit(X_train, y_train)


best_par = grid_DT.best_estimator_.get_params()

print("Best model: ",
      '\n Max Depth = ', best_par['dt__max_depth'],
      '\n Min Samples Split = ', best_par['dt__min_samples_split'],
      '\n Min Samples Leaf = ', best_par['dt__min_samples_leaf']
      )

best_score = grid_DT.score(X_test, y_test)
print("Best DT model score: ", best_score)

results = grid_DT.cv_results_
print("TOP 10 Models: \n")
display_top_ten(results)

Best model:  
 Max Depth =  10 
 Min Samples Split =  3 
 Min Samples Leaf =  4
Best DT model score:  0.6467710371819961
TOP 10 Models: 

         rank_test_score  mean_test_score  std_test_score
kernel                                                   
10_4_6                 1         0.637581        0.004405
10_4_3                 1         0.637581        0.004405
10_1_10                3         0.636952        0.004729
10_1_3                 4         0.636742        0.004094
10_4_10                5         0.636532        0.004512
15_8_3                 6         0.636322        0.006316
15_8_6                 6         0.636322        0.006316
15_8_10                6         0.636322        0.006316
10_1_6                 9         0.636023        0.004690
10_8_10               10         0.635963        0.005283


In [8]:
rf = RandomForestClassifier()

RF = Pipeline([('rf', rf)])

parameters = {
              'rf__n_estimators': [5, 10]#, 15, 20]
              }

grid_RF = GridSearchCV(RF, parameters, verbose=0, return_train_score=True)
grid_RF = grid_RF.fit(X_train, y_train.ravel())


best_par = grid_RF.best_estimator_.get_params()

print("Best model: ",
      '\n Number of Estimators = ', best_par['rf__n_estimators']
      )

best_score = grid_RF.score(X_test, y_test.ravel())
print("Best RF model score: ", best_score)

results = grid_RF.cv_results_
print("TOP 10 Models: \n")
display_top_ten(results)

Best model:  
 Number of Estimators =  10
Best RF model score:  0.7121190942130277
TOP 10 Models: 

        rank_test_score  mean_test_score  std_test_score
kernel                                                  
10                    1         0.707525        0.005723
5                     2         0.671460        0.005174


In [9]:
from sklearn.ensemble import BaggingClassifier


bag = BaggingClassifier(tree.DecisionTreeClassifier()) #, max_samples=0.5, n_estimators=100)

BAG = Pipeline([
               ('bag', bag)])

parameters = {'bag__n_estimators': [5, 10, 15, 20]}

grid_BAG = GridSearchCV(BAG, parameters, verbose=0, return_train_score=True)
grid_BAG = grid_BAG.fit(X_train, y_train.ravel())

best_par = grid_BAG.best_estimator_.get_params()

print("Best model: ",
      '\n Number of Estimators = ', best_par['bag__n_estimators']
      )

best_score = grid_BAG.score(X_test, y_test.ravel())
print("Best BAG model score: ", best_score)

results = grid_BAG.cv_results_
print("TOP 10 Models: \n")
display_top_ten(results)

Best model:  
 Number of Estimators =  20
Best BAG model score:  0.7172211350293543
TOP 10 Models: 

        rank_test_score  mean_test_score  std_test_score
kernel                                                  
20                    1         0.721723        0.002221
15                    2         0.713426        0.002094
10                    3         0.699587        0.003836
5                     4         0.670471        0.001834


In [10]:
from sklearn.ensemble import AdaBoostClassifier

ada = AdaBoostClassifier(tree.DecisionTreeClassifier()) 

ADA = Pipeline([
               ('ada', ada)])


parameters = {
              'ada__n_estimators': [5, 10, 15],
              'ada__learning_rate' : [0.0001, 0.001, 0.1]
              }

grid_ADA = GridSearchCV(ADA, parameters, verbose=0, return_train_score=True)
grid_ADA = grid_ADA.fit(X_train, y_train.ravel())


best_par = grid_ADA.best_estimator_.get_params()

print("Best model: ",
      '\n Number of Estimators = ', best_par['ada__n_estimators'],
      '\n Learning Rate = ', best_par['ada__learning_rate']
      )

best_score = grid_ADA.score(X_test, y_test.ravel())
print("Best AdaBoost model score: ", best_score)

results = grid_ADA.cv_results_
print("TOP 10 Models: \n")
display_top_ten(results)

Best model:  
 Number of Estimators =  15 
 Learning Rate =  0.1
Best AdaBoost model score:  0.715403969807101
TOP 10 Models: 

           rank_test_score  mean_test_score  std_test_score
kernel                                                     
0.1_15                   1         0.717050        0.004917
0.1_10                   2         0.706087        0.003312
0.1_5                    3         0.675174        0.005082
0.001_15                 4         0.621854        0.006746
0.001_10                 5         0.621735        0.009139
0.0001_15                6         0.620237        0.006861
0.001_5                  7         0.618979        0.008762
0.0001_10                8         0.618320        0.007437
0.0001_5                 9         0.617092        0.007179


In [11]:
from sklearn.ensemble import GradientBoostingClassifier


GBC = Pipeline([
               ('gbc', GradientBoostingClassifier())])


parameters = {
              'gbc__n_estimators': [5, 10, 15],
              'gbc__learning_rate' : [0.001, 0.1]              
              }

grid_GBC = GridSearchCV(GBC, parameters, verbose=0, return_train_score=True)
grid_GBC = grid_GBC.fit(X_train, y_train.ravel())


best_par = grid_GBC.best_estimator_.get_params()

print("Best Gradient Boosting Classifier model: ",
      '\n Number of Estimators = ', best_par['gbc__n_estimators'],
      '\n Learning Rate = ', best_par['gbc__learning_rate']
      )

best_score = grid_GBC.score(X_test, y_test.ravel())
print("Best Gradient Boosting Classifier model score: ", best_score)

results = grid_GBC.cv_results_
print("TOP 10 Models: \n")
display_top_ten(results)

Best Gradient Boosting Classifier model:  
 Number of Estimators =  15 
 Learning Rate =  0.1
Best Gradient Boosting Classifier model score:  0.6922001677383282
TOP 10 Models: 

           rank_test_score  mean_test_score  std_test_score
kernel                                                     
0.1_15                   1         0.689702        0.004033
0.1_10                   2         0.676312        0.004801
0.1_5                    3         0.655613        0.004680
0.001_15                 4         0.609633        0.005062
0.001_10                 5         0.592739        0.005289
0.001_5                  6         0.528756        0.006894
0.0001_15                7         0.253834        0.003191
0.0001_10                8         0.246555        0.003748
0.0001_5                 9         0.168045        0.000010
