# Decision tree Classification

## Preparation

### Importing needed libraries

In [2]:
import sys
  
# setting path
sys.path.append('../data/')

In [3]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import preprocessing as pre

  from .autonotebook import tqdm as notebook_tqdm
  demoji.download_codes()
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/aaronsteiner/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aaronsteiner/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/aaronsteiner/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Grid search

### Test data loading

In [5]:
# Load the vectorizer and the train/test frames once for inspection, with
# do_lem and upsample switched on and the remaining flags switched off.
tfidf, df_train, df_test = pre.setup(
    rem_stop=False,
    do_stem=False,
    do_lem=True,
    upsample=True,
    do_emojis=False,
)


Using custom data configuration default
Reusing dataset tweets_hate_speech_detection (/Users/aaronsteiner/.cache/huggingface/datasets/tweets_hate_speech_detection/default/0.0.0/c6b6f41e91ac9113e1c032c5ecf7a49b4e1e9dc8699ded3c2d8425c9217568b2)
100%|██████████| 1/1 [00:00<00:00, 484.95it/s]


There is 25569 training data, of which 7.02% is hate speech 
There is 6393 test data, of which 7.01% is hate speech 


In [7]:
# Preview the first rows of the training frame returned by pre.setup.
df_train.head()

Unnamed: 0,label,tweet,preprocessed
30405,0,when everyone's free when you're in exam mode ...,"[when, everyones, free, when, youre, in, exam,..."
27807,0,#jacksonville rooster simulation: i want to ...,"[jacksonville, rooster, simulation, i, want, t..."
8660,0,@user just run 10kms for @user @user #loveis...,"[user, just, run, 10kms, for, user, user, love..."
19185,0,@user got the prototype for our new usb today!...,"[user, got, the, prototype, for, our, new, usb..."
10355,0,have a &amp; #healthy #fathersday. #runnerda...,"[have, a, amp, healthy, fathersday, runnerdad,..."


### Setup training function

In [2]:
def train_model(df_train: pd.DataFrame, tfidf: TfidfVectorizer):
    """Grid-search a decision tree on the vectorized training tweets.

    Args:
        df_train: Training frame. Its ``'preprocessed'`` column is passed to
            ``tfidf.transform`` and its ``'label'`` column is the target.
        tfidf: Vectorizer used to turn the preprocessed text into features.
            Only ``transform`` is called here, so it must already be fitted.

    Returns:
        ``best_estimator_`` of the fitted ``GridSearchCV``: the pipeline
        (single step ``'dec_tree'``) with the best F1-scoring parameters.
    """
    pipeline = Pipeline(steps=[('dec_tree', DecisionTreeClassifier(random_state=55))])

    features = tfidf.transform(df_train['preprocessed'])
    labels = df_train['label']

    # Search space; every key carries the pipeline step prefix "dec_tree__".
    param_grid = {
        'dec_tree__criterion': ['gini', 'entropy'],
        # An earlier sweep over 100-280 never selected a depth below 200, and
        # some models picked 280, so the upper bound was raised to 400 (exclusive).
        'dec_tree__max_depth': list(range(200, 400, 20)),
        'dec_tree__min_samples_split': list(range(2, 20, 2)),
        # A wider sweep (1-9) was tried, but every model used 1 or 2.
        'dec_tree__min_samples_leaf': list(range(1, 3)),
        # 'balanced' is left out because it yielded an F1 below 0.50.
        'dec_tree__class_weight': [None],
    }

    search = GridSearchCV(pipeline, param_grid=param_grid, scoring='f1', n_jobs=-1)
    search.fit(features, labels)

    return search.best_estimator_


### Setup testing function

In [3]:
def test_model(model, df_test: pd.DataFrame, tfidf: TfidfVectorizer):
    """Evaluate a fitted model on the test frame.

    Args:
        model: Fitted estimator exposing ``predict`` and ``get_params``.
        df_test: Test frame. Its ``'preprocessed'`` column is vectorized with
            ``tfidf`` and its ``'label'`` column is the ground truth.
        tfidf: Vectorizer used to transform the preprocessed test text.

    Returns:
        A list ``[params, precision, recall, accuracy, f1]`` where ``params``
        is ``model.get_params()`` and the rest are the matching
        ``sklearn.metrics`` scores for the test predictions.
    """
    features = tfidf.transform(df_test['preprocessed'])
    expected = df_test['label']
    predicted = model.predict(features)

    # Order matters: the export cell writes these entries line by line.
    scorers = (
        metrics.precision_score,
        metrics.recall_score,
        metrics.accuracy_score,
        metrics.f1_score,
    )
    return [model.get_params(), *(scorer(expected, predicted) for scorer in scorers)]

### Setup result list

In [10]:
# Accumulates the report across the experiment cells below: section labels and
# spacers as strings, interleaved with the metric lists returned by test_model.
results: list = []

### Only Tokenization

In [5]:
# Baseline run: every optional preprocessing flag is switched off.
tfidf, df_train, df_test = pre.setup(
    rem_stop=False,
    do_stem=False,
    do_lem=False,
    upsample=False,
    do_emojis=False,
)
model = train_model(df_train, tfidf)

# Record the section label first, then the scores followed by a spacer.
results.append("Only Tokenization \n")
results.extend([test_model(model, df_test, tfidf), "\n\n"])

Using custom data configuration default
Reusing dataset tweets_hate_speech_detection (/home/jovyan/.cache/huggingface/datasets/tweets_hate_speech_detection/default/0.0.0/c6b6f41e91ac9113e1c032c5ecf7a49b4e1e9dc8699ded3c2d8425c9217568b2)
100%|██████████| 1/1 [00:00<00:00, 437.09it/s]


### Remove Stopwords

In [6]:
# Only rem_stop is enabled; all other preprocessing flags stay off.
tfidf, df_train, df_test = pre.setup(
    rem_stop=True,
    do_stem=False,
    do_lem=False,
    upsample=False,
    do_emojis=False,
)
model = train_model(df_train, tfidf)

# Record the section label first, then the scores followed by a spacer.
results.append("\n\nRemove Stopwords \n")
results.extend([test_model(model, df_test, tfidf), "\n\n"])

Using custom data configuration default
Reusing dataset tweets_hate_speech_detection (/home/jovyan/.cache/huggingface/datasets/tweets_hate_speech_detection/default/0.0.0/c6b6f41e91ac9113e1c032c5ecf7a49b4e1e9dc8699ded3c2d8425c9217568b2)
100%|██████████| 1/1 [00:00<00:00, 668.52it/s]


### Emojis

In [7]:
# rem_stop and do_emojis are enabled; stemming, lemmatization and upsampling stay off.
tfidf, df_train, df_test = pre.setup(
    rem_stop=True,
    do_stem=False,
    do_lem=False,
    upsample=False,
    do_emojis=True,
)
model = train_model(df_train, tfidf)

# Record the section label first, then the scores followed by a spacer.
results.append("\n\nEmojis \n")
results.extend([test_model(model, df_test, tfidf), "\n\n"])

Using custom data configuration default
Reusing dataset tweets_hate_speech_detection (/home/jovyan/.cache/huggingface/datasets/tweets_hate_speech_detection/default/0.0.0/c6b6f41e91ac9113e1c032c5ecf7a49b4e1e9dc8699ded3c2d8425c9217568b2)
100%|██████████| 1/1 [00:00<00:00, 783.25it/s]


### Stemming

In [8]:
# rem_stop, do_stem and do_emojis are enabled; lemmatization and upsampling stay off.
tfidf, df_train, df_test = pre.setup(
    rem_stop=True,
    do_stem=True,
    do_lem=False,
    upsample=False,
    do_emojis=True,
)
model = train_model(df_train, tfidf)

# Record the section label first, then the scores followed by a spacer.
results.append("\n\nStemming \n")
results.extend([test_model(model, df_test, tfidf), "\n\n"])

Using custom data configuration default
Reusing dataset tweets_hate_speech_detection (/home/jovyan/.cache/huggingface/datasets/tweets_hate_speech_detection/default/0.0.0/c6b6f41e91ac9113e1c032c5ecf7a49b4e1e9dc8699ded3c2d8425c9217568b2)
100%|██████████| 1/1 [00:00<00:00, 714.65it/s]


### Upsampling

In [9]:
# Every flag except do_lem is enabled, including upsample.
tfidf, df_train, df_test = pre.setup(
    rem_stop=True,
    do_stem=True,
    do_lem=False,
    upsample=True,
    do_emojis=True,
)
model = train_model(df_train, tfidf)

# Record the section label first, then the scores followed by a spacer.
results.append("\n\nUpsampling \n")
results.extend([test_model(model, df_test, tfidf), "\n\n"])

Using custom data configuration default
Reusing dataset tweets_hate_speech_detection (/home/jovyan/.cache/huggingface/datasets/tweets_hate_speech_detection/default/0.0.0/c6b6f41e91ac9113e1c032c5ecf7a49b4e1e9dc8699ded3c2d8425c9217568b2)
100%|██████████| 1/1 [00:00<00:00, 711.86it/s]


### All-but-Stemming

In [10]:
# rem_stop, upsample and do_emojis are enabled; do_stem and do_lem stay off.
tfidf, df_train, df_test = pre.setup(
    rem_stop=True,
    do_stem=False,
    do_lem=False,
    upsample=True,
    do_emojis=True,
)
model = train_model(df_train, tfidf)

# Record the section label first, then the scores followed by a spacer.
results.append("\n\nAll-but-Stemming \n")
results.extend([test_model(model, df_test, tfidf), "\n\n"])

Using custom data configuration default
Reusing dataset tweets_hate_speech_detection (/home/jovyan/.cache/huggingface/datasets/tweets_hate_speech_detection/default/0.0.0/c6b6f41e91ac9113e1c032c5ecf7a49b4e1e9dc8699ded3c2d8425c9217568b2)
100%|██████████| 1/1 [00:00<00:00, 718.33it/s]


### Export results to file

In [16]:
# Write the collected results to a plain-text report, one entry per line.
# String entries (section labels and spacers) are written as they are; any
# other entry is a list of results and is flattened so each item gets its own line.
# The context manager closes the file even if a write raises, which the
# previous explicit open()/close() pair did not guarantee.
with open("results_descision_tree.txt", "w") as textfile:
    for element in results:
        if isinstance(element, str):
            textfile.write(str(element) + "\n")
        else:
            for subelement in element:
                textfile.write(str(subelement) + "\n")