# Decision tree Classification

## Preparation

### Importing needed libraries

In [None]:
import sys
  
# Add ../data/ to the module search path so modules stored there can be
# imported; presumably this is where the local `preprocessing` module
# imported below lives (confirm against the repository layout).
sys.path.append('../data/')

In [None]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import preprocessing as pre

# Grid search

### Test data loading

In [None]:
# Load one configuration (do_lem and upsample enabled, the other flags off)
# so the data can be inspected below. Each experiment cell further down calls
# pre.setup again with its own flags and rebinds these three names.
tfidf, df_train, df_test = pre.setup(rem_stop=False, do_stem=False, do_lem=True, upsample=True, do_emojis=False)


In [None]:
# Display the first rows of the training frame as a quick sanity check.
df_train.head()

### Setup training function

In [None]:
def train_model(df_train: pd.DataFrame, tfidf: TfidfVectorizer):
    """Grid-search a decision tree on the TF-IDF features of ``df_train``.

    Args:
        df_train: Training frame. Its ``'preprocessed'`` column is vectorized
            and its ``'label'`` column is used as the target.
        tfidf: Vectorizer used to turn the text into features. Only
            ``transform`` is called here, so it is presumably already fitted
            (confirm against ``pre.setup``).

    Returns:
        The ``best_estimator_`` of the grid search: a single-step pipeline
        (step name ``'dec_tree'``) wrapping the selected tree. The search is
        scored with ``'f1'``.
    """
    features = tfidf.transform(df_train['preprocessed'])
    labels = df_train['label']

    pipeline = Pipeline(
        steps=[('dec_tree', DecisionTreeClassifier(random_state=55))]
    )

    # Keys carry the ``dec_tree__`` prefix so each value is routed to the
    # pipeline step of that name.
    param_grid = {
        'dec_tree__criterion': ['gini', 'entropy'],
        # 100-280 was tried first, but no model used a depth below 200 and
        # some used 280, so the range was raised towards 400.
        'dec_tree__max_depth': list(range(200, 400, 20)),
        'dec_tree__min_samples_split': list(range(2, 20, 2)),
        # range(1, 10) was tried, but every model used 1 or 2.
        'dec_tree__min_samples_leaf': list(range(1, 5)),
        # 'balanced' yielded an F1 below .50, so only None is searched.
        'dec_tree__class_weight': [None],
    }

    search = GridSearchCV(pipeline, param_grid=param_grid, scoring='f1', n_jobs=-1)
    search.fit(features, labels)

    return search.best_estimator_


### Setup testing function

In [None]:
def test_model(model, df_test: pd.DataFrame, tfidf: TfidfVectorizer):
    """Score ``model`` on ``df_test`` and return a printable report.

    Args:
        model: Estimator exposing ``predict`` and ``get_params``.
        df_test: Test frame. Its ``'preprocessed'`` column is vectorized and
            its ``'label'`` column is the ground truth.
        tfidf: Vectorizer used to transform the test text.

    Returns:
        A list whose first item is the dict from ``model.get_params()``,
        followed by four strings of the form ``"<Metric>: <value>"`` for
        precision, recall, accuracy and F1, in that order.
    """
    features = tfidf.transform(df_test['preprocessed'])
    truth = df_test['label']
    predicted = model.predict(features)

    # Metric label -> scoring function, in the order they appear in the report.
    scorers = (
        ("Precision", metrics.precision_score),
        ("Recall", metrics.recall_score),
        ("Accuracy", metrics.accuracy_score),
        ("F1", metrics.f1_score),
    )

    report = [model.get_params()]
    report.extend(f"{label}: {score(truth, predicted)}" for label, score in scorers)
    return report

### Setup result list

In [None]:
# Accumulates, in run order, a section title (str) followed by the list
# returned by test_model for each experiment; written to disk by the export
# cell at the end.
results = []

### Only Tokenization

In [None]:
# Baseline run: every pre.setup flag is switched off.
tfidf, df_train, df_test = pre.setup(rem_stop=False, do_stem=False, do_lem=False, upsample=False, do_emojis=False)
model = train_model(df_train, tfidf)
# Record the section title first, then the report list from test_model.
results.append("Only Tokenization \n")
results.append(test_model(model, df_test, tfidf))

### Remove Stopwords

In [None]:
# Only rem_stop is enabled; all other pre.setup flags stay off.
tfidf, df_train, df_test = pre.setup(rem_stop=True, do_stem=False, do_lem=False, upsample=False, do_emojis=False)
model = train_model(df_train, tfidf)
# The leading blank lines in the title separate this section from the
# previous one in the exported file.
results.append("\n\nRemove Stopwords \n")
results.append(test_model(model, df_test, tfidf))

### Emojis

In [None]:
# Builds on the stopword run: rem_stop and do_emojis are enabled.
tfidf, df_train, df_test = pre.setup(rem_stop=True, do_stem=False, do_lem=False, upsample=False, do_emojis=True)
model = train_model(df_train, tfidf)
# Record the section title first, then the report list from test_model.
results.append("\n\nEmojis \n")
results.append(test_model(model, df_test, tfidf))

### Stemming

In [None]:
# Builds on the emoji run: rem_stop, do_emojis and do_stem are enabled.
tfidf, df_train, df_test = pre.setup(rem_stop=True, do_stem=True, do_lem=False, upsample=False, do_emojis=True)
model = train_model(df_train, tfidf)
# Record the section title first, then the report list from test_model.
results.append("\n\nStemming \n")
results.append(test_model(model, df_test, tfidf))

### Upsampling

In [None]:
# Builds on the stemming run: upsample is enabled as well (only do_lem is off).
tfidf, df_train, df_test = pre.setup(rem_stop=True, do_stem=True, do_lem=False, upsample=True, do_emojis=True)
model = train_model(df_train, tfidf)
# Record the section title first, then the report list from test_model.
results.append("\n\nUpsampling \n")
results.append(test_model(model, df_test, tfidf))

### All-but-Stemming

In [None]:
# Same flags as the upsampling run, except do_stem is switched off.
tfidf, df_train, df_test = pre.setup(rem_stop=True, do_stem=False, do_lem=False, upsample=True, do_emojis=True)
model = train_model(df_train, tfidf)
# Record the section title first, then the report list from test_model.
results.append("\n\nAll-but-Stemming \n")
results.append(test_model(model, df_test, tfidf))

### Export results to file

In [None]:
# Write every collected entry to the report file, one item per line.
# `results` holds section titles (plain strings) and the lists returned by
# test_model; list entries are written item by item.
# The context manager closes the file even if a write raises, and the
# explicit encoding keeps the output independent of the platform default.
with open("results_descision_tree.txt", "w", encoding="utf-8") as textfile:
    for element in results:
        if isinstance(element, str):
            textfile.write(element + "\n")
        else:
            for subelement in element:
                textfile.write(str(subelement) + "\n")

### Results

Only Tokenization 

{'memory': None, 'steps': [('dec_tree', DecisionTreeClassifier(max_depth=260, min_samples_split=10, random_state=55))], 'verbose': False, 'dec_tree': DecisionTreeClassifier(max_depth=260, min_samples_split=10, random_state=55), 'dec_tree__ccp_alpha': 0.0, 'dec_tree__class_weight': None, 'dec_tree__criterion': 'gini', 'dec_tree__max_depth': 260, 'dec_tree__max_features': None, 'dec_tree__max_leaf_nodes': None, 'dec_tree__min_impurity_decrease': 0.0, 'dec_tree__min_samples_leaf': 1, 'dec_tree__min_samples_split': 10, 'dec_tree__min_weight_fraction_leaf': 0.0, 'dec_tree__random_state': 55, 'dec_tree__splitter': 'best'}<br><br>
Precision: 0.6286472148541115<br>
Recall: 0.5290178571428571<br>
Accuracy: 0.9450961989676209<br>
F1: 0.5745454545454545<br>


Remove Stopwords 

{'memory': None, 'steps': [('dec_tree', DecisionTreeClassifier(max_depth=300, min_samples_split=4, random_state=55))], 'verbose': False, 'dec_tree': DecisionTreeClassifier(max_depth=300, min_samples_split=4, random_state=55), 'dec_tree__ccp_alpha': 0.0, 'dec_tree__class_weight': None, 'dec_tree__criterion': 'gini', 'dec_tree__max_depth': 300, 'dec_tree__max_features': None, 'dec_tree__max_leaf_nodes': None, 'dec_tree__min_impurity_decrease': 0.0, 'dec_tree__min_samples_leaf': 1, 'dec_tree__min_samples_split': 4, 'dec_tree__min_weight_fraction_leaf': 0.0, 'dec_tree__random_state': 55, 'dec_tree__splitter': 'best'}<br><br>
Precision: 0.6370558375634517<br>
Recall: 0.5602678571428571<br>
Accuracy: 0.9468168309088065<br>
F1: 0.5961995249406175<br>


Emojis 

{'memory': None, 'steps': [('dec_tree', DecisionTreeClassifier(max_depth=200, min_samples_split=4, random_state=55))], 'verbose': False, 'dec_tree': DecisionTreeClassifier(max_depth=200, min_samples_split=4, random_state=55), 'dec_tree__ccp_alpha': 0.0, 'dec_tree__class_weight': None, 'dec_tree__criterion': 'gini', 'dec_tree__max_depth': 200, 'dec_tree__max_features': None, 'dec_tree__max_leaf_nodes': None, 'dec_tree__min_impurity_decrease': 0.0, 'dec_tree__min_samples_leaf': 1, 'dec_tree__min_samples_split': 4, 'dec_tree__min_weight_fraction_leaf': 0.0, 'dec_tree__random_state': 55, 'dec_tree__splitter': 'best'}

Precision: 0.6469002695417789<br>
Recall: 0.5357142857142857<br>
Accuracy: 0.9469732519943689<br>
F1: 0.586080586080586<br>


Stemming 

{'memory': None, 'steps': [('dec_tree', DecisionTreeClassifier(max_depth=260, min_samples_split=8, random_state=55))], 'verbose': False, 'dec_tree': DecisionTreeClassifier(max_depth=260, min_samples_split=8, random_state=55), 'dec_tree__ccp_alpha': 0.0, 'dec_tree__class_weight': None, 'dec_tree__criterion': 'gini', 'dec_tree__max_depth': 260, 'dec_tree__max_features': None, 'dec_tree__max_leaf_nodes': None, 'dec_tree__min_impurity_decrease': 0.0, 'dec_tree__min_samples_leaf': 1, 'dec_tree__min_samples_split': 8, 'dec_tree__min_weight_fraction_leaf': 0.0, 'dec_tree__random_state': 55, 'dec_tree__splitter': 'best'}<br><br>
Precision: 0.6419753086419753<br>
Recall: 0.5803571428571429<br>
Accuracy: 0.9479117785077429<br>
F1: 0.6096131301289566<br>


Upsampling 

{'memory': None, 'steps': [('dec_tree', DecisionTreeClassifier(criterion='entropy', max_depth=200, min_samples_leaf=2,
                       min_samples_split=6, random_state=55))], 'verbose': False, 'dec_tree': DecisionTreeClassifier(criterion='entropy', max_depth=200, min_samples_leaf=2,
                       min_samples_split=6, random_state=55), 'dec_tree__ccp_alpha': 0.0, 'dec_tree__class_weight': None, 'dec_tree__criterion': 'entropy', 'dec_tree__max_depth': 200, 'dec_tree__max_features': None, 'dec_tree__max_leaf_nodes': None, 'dec_tree__min_impurity_decrease': 0.0, 'dec_tree__min_samples_leaf': 2, 'dec_tree__min_samples_split': 6, 'dec_tree__min_weight_fraction_leaf': 0.0, 'dec_tree__random_state': 55, 'dec_tree__splitter': 'best'}<br><br>
Precision: 0.5952380952380952<br>
Recall: 0.5022321428571429<br>
Accuracy: 0.9411856718285625<br>
F1: 0.5447941888619855<br>


All-but-Stemming 

{'memory': None, 'steps': [('dec_tree', DecisionTreeClassifier(criterion='entropy', max_depth=220, min_samples_leaf=2,
                       min_samples_split=8, random_state=55))], 'verbose': False, 'dec_tree': DecisionTreeClassifier(criterion='entropy', max_depth=220, min_samples_leaf=2,
                       min_samples_split=8, random_state=55), 'dec_tree__ccp_alpha': 0.0, 'dec_tree__class_weight': None, 'dec_tree__criterion': 'entropy', 'dec_tree__max_depth': 220, 'dec_tree__max_features': None, 'dec_tree__max_leaf_nodes': None, 'dec_tree__min_impurity_decrease': 0.0, 'dec_tree__min_samples_leaf': 2, 'dec_tree__min_samples_split': 8, 'dec_tree__min_weight_fraction_leaf': 0.0, 'dec_tree__random_state': 55, 'dec_tree__splitter': 'best'}<br><br>
Precision: 0.5955678670360111<br>
Recall: 0.4799107142857143<br>
Accuracy: 0.9407164085718754<br>
F1: 0.5315203955500618<br>
