In [15]:
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn import tree
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer 
import gensim
from sklearn.ensemble import RandomForestClassifier

In [16]:
df = pd.read_csv("cyberbullying_tweets.csv")
Y = df.iloc[::,1].to_numpy()
types = {'age':0,
         'ethnicity':1,
         'gender':2,
         'not_cyberbullying':3,
         'other_cyberbullying':4,
         'religion':5}
Y = [types[y] for y in Y]
Y = np.reshape(Y, (len(Y),1))
X = df.iloc[::,0].to_numpy()
X = [''.join(item.lower() for item in x if item.isalpha() or item == " ") for x in X]
X = [x.split(" ") for x in X] 
#X = [item for sublist in X for item in sublist]

model_w2v = gensim.models.Word2Vec(
            X,
            vector_size=50, # desired no. of features/independent variables
            window=5, # context window size
            min_count=2, # Ignores all words with total frequency lower than 2.                                  
            sg = 1, # 1 for skip-gram model
            hs = 0,
            negative = 10, # for negative sampling
            workers= 32, # no.of cores
            seed = 34) 

model_w2v.train(X, total_examples= len(X), epochs=20)


(17154274, 22638920)

In [17]:
import nltk
nltk.download('wordnet')
import nltk
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords
from nltk.stem.porter import *

### PARAMETERS ###
# if stemming, else lemmatizing words
use_stemm = True
# dimension vectors needed
dimension_selected = 100

##################
# stop words
list_stp_wrd = stopwords.words('english')

# Init the Wordnet Lemmatizer
lemmatizer = WordNetLemmatizer()

# Init stemmer
stemmer = PorterStemmer()
# set were we get the new dataset with lemmatize words
lemmatize_X = []

#for each sentence (list of word)
for sentence_wrd in X:
    sentence_wrd = [x  for x in sentence_wrd if x not in list_stp_wrd]
    # we lemmatize each wrd
    if use_stemm:
        # stemming if necessary
        lemm_sentence_wrd = [stemmer.stem(wrd) for wrd in sentence_wrd]
    else:
        # else lemmatize
        lemm_sentence_wrd = [lemmatizer.lemmatize(wrd) for wrd in sentence_wrd]
        
    lemmatize_X.append(lemm_sentence_wrd)

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/paulloubet/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [18]:
# construct a dictionnary
import itertools
all_words = list(itertools.chain.from_iterable(lemmatize_X))


In [20]:
from collections import Counter
# remove empty caracters
all_words = [x for x in all_words if x != '']
# count occurence of each word
dist_counter = Counter(all_words)

# sorted by the most frequents words
sorted_dict = sorted(dist_counter.items(),key=lambda x: x[1] , reverse=True)

In [6]:
# parameters of words wee keep


sorted_dict = sorted_dict[:dimension_selected]

# get the list of most frequent words
mf_words = [x[0] for x in sorted_dict]

In [7]:

BOW = []
for sentence_word in lemmatize_X:
    vector = []
    for word in mf_words :
        # add the number of time the word is in the sentence
        vector.append(sentence_word.count(word))
    # add the vector construct
    BOW.append(vector)

In [8]:
features_data = pd.DataFrame(BOW)

In [9]:
def word_vector(tokens, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0
    for word in tokens:
        try:
            vec += model_w2v.wv[word].reshape((1, size))
            count += 1.
        except KeyError:  # handling the case where the token is not in vocabulary
            continue
    if count != 0:
        vec /= count
    return vec
wordvec_arrays = np.zeros((len(X), 50)) 
for i in range(len(X)):
    wordvec_arrays[i,:] = word_vector(X[i], 50)

"""
wordvec_df = pd.DataFrame(wordvec_arrays)
wordvec_df.shape
"""

'\nwordvec_df = pd.DataFrame(wordvec_arrays)\nwordvec_df.shape\n'

In [10]:
class Tree():
    def __init__(self, max_depth, min_samples_split=2, min_samples_leaf=1, splitting=10):
        """Tree Descision Classifier.

        Args:
            max_depth (int): The maximum depth of the tree.
            
            min_samples_split (int): The minimum number of samples
                required to split an internal node.
                
            min_samples_leaf (int): The minimum number of samples
                required to be at a leaf node
        """
        assert max_depth >= 1, "max_depth must be greater or equal than 1"
        assert min_samples_split >= 2, "min_samples_split must be greater or equal than 2"
        assert min_samples_leaf >= 1, "min_samples_leaf must be greater or equal than 1"
        
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.splitting = splitting
        self.nodes = {"root": {}}
        
    def gini_index(self, sub, m):
        n = len(sub)
        proportions = sum([((sub[:,-1] == x).sum()/n)**2 for x in np.unique(sub[:,-1])])
        return (1-proportions) * (n/m)
        
        
    def get_split(self, X, depth, node):
        
        m = len(X)
        
        if depth != 0 and m >= self.min_samples_split:
            
            best_split = None
            best_feature = None
            best_value = float("inf")
            
            for feature in range(len(X[0]) - 1):
                if self.splitting is None:
                    uniques = np.unique(X[:,feature])
                else:
                    all_sorted = sorted(X[:,feature])
                    batch = len(all_sorted) // self.splitting
                    uniques = [all_sorted[i*batch] for i in range(self.splitting)]
                for split in uniques:
                    A, B = X[X[:,feature] <= split], X[X[:,feature] > split]
                    if len(A) >= self.min_samples_leaf and len(B) >= self.min_samples_leaf:
                        value = self.gini_index(A, m) + self.gini_index(B, m)
                        if value < best_value:
                            best_value = value
                            best_feature = feature
                            best_split = split
            
            if best_feature is not None:
                A, B = X[X[:,best_feature] <= best_split], X[X[:,best_feature] > best_split]
                node["feature"] = best_feature
                node["split"] = best_split
                node["A"] = {}
                node["B"] = {}
                node["class_A"] = np.unique(A[:,-1])[np.argmax([(A[:,-1] == x).sum() for x in np.unique(A[:,-1])])]
                node["class_B"] = np.unique(B[:,-1])[np.argmax([(B[:,-1] == x).sum() for x in np.unique(B[:,-1])])]
                self.get_split(A, depth-1, node["A"])
                self.get_split(B, depth-1, node["B"])
                
    
    def fit(self, X, y):
        X = np.append(X,y, axis=1)
        self.get_split(X, self.max_depth, self.nodes["root"])
        
    def predict(self, X):
        node = self.nodes["root"]
        while True:
            if X[node["feature"]] <= node["split"]:
                if not node["A"]:
                    return node["class_A"]
                else:
                    node = node["A"]
            else:
                if not node["B"]:
                    return node["class_B"]
                else:
                    node = node["B"]
    
    def score(self, X, Y):
        count = 0
        for x, y in zip(X,Y):
            if self.predict(x) == y: count += 1
        return count/len(X)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(features_data, Y, test_size=0.30, random_state=0)
X_train = X_train.values.tolist()
X_test = X_test.values.tolist()

clf = tree.DecisionTreeClassifier(max_depth = 10, min_samples_split = 6, min_samples_leaf= 1, random_state=42)
clf.fit(X_train, y_train)
print("sk-learn model score: ", clf.score(X_test, y_test))

model = Tree(max_depth=10, min_samples_split = 6, min_samples_leaf= 1, splitting=10)
model.fit(X_train, y_train)
print("Homemade model score: ", model.score(X_test, y_test))

sk-learn model score:  0.7042913055633212
Homemade model score:  0.7045708694436679


In [12]:
class RandomForest():
    def __init__(self, n_estimators):
        self.n_estimators = n_estimators
        
    def fit(self, X, y):
        self.all_trees = []
        sub_size = round(len(X)*(2/3))
        for i in range(self.n_estimators):
            id = np.random.randint(0,len(X),sub_size)
            subX = np.array(X)[id]
            suby = y[id.astype(int)]
            t = Tree(10)
            t.fit(subX, suby)
            self.all_trees.append(t)
            
    def predict(self, X):
        predictions = []
        for t in self.all_trees:
            predictions.append(t.predict(X))
        return max(set(predictions), key=predictions.count)

    def score(self, X, Y):
        count = 0
        for x, y in zip(X,Y):
            if self.predict(x) == y: count += 1
        return count/len(X)

In [13]:
X_train, X_test, y_train, y_test = train_test_split(features_data, Y, test_size=0.30, random_state=0)
X_train = X_train.values.tolist()
X_test = X_test.values.tolist()

clf = RandomForestClassifier(n_estimators=10)
clf.fit(X_train, y_train)
print("sk-learn model score: ", clf.score(X_test, y_test))

model = RandomForest(10)
model.fit(X_train, y_train)
print("Homemade model score: ", model.score(X_test, y_test))

  clf.fit(X_train, y_train)


sk-learn model score:  0.788160469667319
Homemade model score:  0.7054095610847079
