# Decision tree Classification

## Preperation 

### Importing needed libraries

In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.feature_extraction.text import TfidfVectorizer

### Loading Data

In [2]:
train_data = pd.read_csv("../../data/220510_train_data_preprocessed.csv", sep=';')
test_data = pd.read_csv("../../data/220510_test_data_preprocessed.csv", sep=";")
train_data.head()

Unnamed: 0,id,label,tweet,tweet_converted_emojis,n_mentions,hashtags,without_punctuation,tweet_lower,tweet_token,clean_token,clean_hashtags,stemmed_tokens,stemmed_hashtags,lemmatized_tokens,lemmatized_hashtags
0,1,0,@user when a father is dysfunctional and is s...,@user when a father is dysfunctional and is s...,1,['run'],user when a father is dysfunctional and is so...,user when a father is dysfunctional and is so...,"['user', 'when', 'a', 'father', 'is', 'dysfunc...","['user', 'father', 'dysfunctional', 'selfish',...",['run'],"['user', 'father', 'dysfunct', 'selfish', 'dra...",['run'],"['user', 'father', 'dysfunctional', 'selfish',...",['run']
1,2,0,@user @user thanks for #lyft credit i can't us...,@user @user thanks for #lyft credit i can't us...,2,"['lyft', 'disapointed', 'getthanked']",user user thanks for lyft credit i cant use ca...,user user thanks for lyft credit i cant use ca...,"['user', 'user', 'thanks', 'for', 'lyft', 'cre...","['user', 'user', 'thanks', 'lyft', 'credit', '...","['lyft', 'disapointed', 'getthanked']","['user', 'user', 'thank', 'lyft', 'credit', 'c...","['lyft', 'disapoint', 'getthank']","['user', 'user', 'thanks', 'lyft', 'credit', '...","['lyft', 'disapointed', 'getthanked']"
2,3,0,bihday your majesty,bihday your majesty,0,[],bihday your majesty,bihday your majesty,"['bihday', 'your', 'majesty']","['bihday', 'majesty']",[],"['bihday', 'majesti']",[],"['bihday', 'majesty']",[]
3,4,0,#model i love u take with u all the time in ...,#model i love u take with u all the time in ...,0,['model'],model i love u take with u all the time in u...,model i love u take with u all the time in u...,"['model', 'i', 'love', 'u', 'take', 'with', 'u...","['model', 'love', 'u', 'take', 'u', 'time', 'u...",['model'],"['model', 'love', 'u', 'take', 'u', 'time', 'u...",['model'],"['model', 'love', 'u', 'take', 'u', 'time', 'u...",['model']
4,5,0,factsguide: society now #motivation,factsguide: society now #motivation,0,['motivation'],factsguide society now motivation,factsguide society now motivation,"['factsguide', 'society', 'now', 'motivation']","['factsguide', 'society', 'motivation']",['motivation'],"['factsguid', 'societi', 'motiv']",['motiv'],"['factsguide', 'society', 'motivation']",['motivation']


## Feature Selection Lemmatized

In [3]:
"""features = ["n_mentions", "lemmatized_hashtags", "lemmatized_tokens"]
X = data[features]
y = data.label
X.head()"""

'features = ["n_mentions", "lemmatized_hashtags", "lemmatized_tokens"]\nX = data[features]\ny = data.label\nX.head()'

## Bag of words model

Create the bag of words from the data

In [4]:
"""import texthero as hero
from texthero import preprocessing
X["bow_tokens"] = (hero.tfidf(X["lemmatized_tokens"], max_features=15000))
X["bow_hashtags"] = (hero.tfidf(X["lemmatized_hashtags"], max_features=15000))
X"""

'import texthero as hero\nfrom texthero import preprocessing\nX["bow_tokens"] = (hero.tfidf(X["lemmatized_tokens"], max_features=15000))\nX["bow_hashtags"] = (hero.tfidf(X["lemmatized_hashtags"], max_features=15000))\nX'

Reduce the dimension of the vector

In [5]:
"""X["bow_tokens"] = (hero.tsne(X["bow_tokens"]))
X["bow_hashtags"] = (hero.tsne(X["bow_hashtags"]))
X"""

'X["bow_tokens"] = (hero.tsne(X["bow_tokens"]))\nX["bow_hashtags"] = (hero.tsne(X["bow_hashtags"]))\nX'

Split vector into columns

In [6]:
"""X[["bow_tokens_1", "bow_tokens_"]] = pd.DataFrame(X.bow_tokens.tolist(), index= X.index)
X[["bow_hashtags_1", "bow_hashtags_2"]] = pd.DataFrame(X.bow_hashtags.tolist(), index= X.index)
X"""

'X[["bow_tokens_1", "bow_tokens_"]] = pd.DataFrame(X.bow_tokens.tolist(), index= X.index)\nX[["bow_hashtags_1", "bow_hashtags_2"]] = pd.DataFrame(X.bow_hashtags.tolist(), index= X.index)\nX'

# Lemmatisation

In [7]:
features = ["lemmatized_tokens", "lemmatized_hashtags", "n_mentions"]

X = train_data[features]
y = train_data.label

X.head()

Unnamed: 0,lemmatized_tokens,lemmatized_hashtags,n_mentions
0,"['user', 'father', 'dysfunctional', 'selfish',...",['run'],1
1,"['user', 'user', 'thanks', 'lyft', 'credit', '...","['lyft', 'disapointed', 'getthanked']",2
2,"['bihday', 'majesty']",[],0
3,"['model', 'love', 'u', 'take', 'u', 'time', 'u...",['model'],0
4,"['factsguide', 'society', 'motivation']",['motivation'],0


### Using TF-IDF Vectorizer

In [8]:
X.shape

(31962, 3)

In [9]:
tf = TfidfVectorizer()

X_vec = tf.fit(X["lemmatized_tokens"])
X_lem = X_vec.transform(X["lemmatized_tokens"])

In [10]:
X_lem.shape

(31962, 40632)

## Split data

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X_lem, y, test_size=0.2, random_state = 17, stratify=y)

In [12]:
print(f"Shape X_train {X_train.shape}")
print(f"Shape X_test {X_test.shape}")

Shape X_train (25569, 40632)
Shape X_test (6393, 40632)


## Fitting model

In [25]:
result_precision_score = []
result_recall_score = []
result_accuracy_score = []
result_f1_score = []
max_depth = 100

for i in range(5, max_depth):
    classifier = DecisionTreeClassifier(random_state=55, max_depth=i)

    model = classifier.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    
    result_precision_score.append(metrics.precision_score(y_test, y_pred))
    result_recall_score.append(metrics.recall_score(y_test, y_pred))
    result_accuracy_score.append(metrics.accuracy_score(y_test, y_pred))
    result_f1_score.append(metrics.f1_score(y_test, y_pred))


#print(len([i for i in range(5, max_depth)]), len(result_precision_score),len(result_recall_score),len(result_accuracy_score),len(result_f1_score))


result_data = {
    "max_depth": [i for i in range(5, max_depth)],
    "precision_score": result_precision_score,
    "recall_score": result_recall_score,
    "accuracy_score": result_accuracy_score,
    "f1_score": result_f1_score
}
df = pd.DataFrame(result_data)
df

Unnamed: 0,max_depth,precision_score,recall_score,accuracy_score,f1_score
0,5,0.780142,0.245536,0.942281,0.373514
1,6,0.779221,0.267857,0.943376,0.398671
2,7,0.794118,0.301339,0.945565,0.436893
3,8,0.803681,0.292411,0.945409,0.428805
4,9,0.807018,0.308036,0.946348,0.445880
...,...,...,...,...,...
90,95,0.718182,0.529018,0.952448,0.609254
91,96,0.716049,0.517857,0.951822,0.601036
92,97,0.722045,0.504464,0.951666,0.593955
93,98,0.711246,0.522321,0.951666,0.602317


In [26]:
import plotly.express as px


fig = px.line(df, x="max_depth", y= ["f1_score", "accuracy_score"], title="Lemmatisation results")
fig.show()

[[1, 0.9278339764313276], [2, 0.9278339764313276], [3, 0.9296068411721764], [4, 0.9297111273334029], [5, 0.9311711335905726], [6, 0.9316925643967046], [7, 0.9320054228803838], [8, 0.9322139952028365], [9, 0.931275419751799], [10, 0.9327354260089686], [11, 0.932318281364063], [12, 0.9316925643967046], [13, 0.9298154134946293], [14, 0.9300239858170821], [15, 0.9292939826884973], [16, 0.9285639795599124], [17, 0.9292939826884973], [18, 0.9271039733027427], [19, 0.9248096777557618], [20, 0.9244968192720826], [21, 0.9206382313067056], [22, 0.9203253728230264], [23, 0.9161539263739702], [24, 0.9144853477943476], [25, 0.9147982062780269], [26, 0.911982479924914], [27, 0.9087496089268954], [28, 0.9072896026697257], [29, 0.9055167379288769], [30, 0.9042653039941599], [31, 0.9036395870268016], [32, 0.9040567316717072], [33, 0.9028052977369903], [34, 0.9028052977369903], [35, 0.903222442381896], [36, 0.9017624361247263], [37, 0.900928146834915], [38, 0.900928146834915], [39, 0.9011367191573678], [40, 0.9008238606736886], [41, 0.8987381374491605], [42, 0.8992595682552925], [43, 0.8989467097716133], [44, 0.8973824173532172], [45, 0.8958181249348212], [46, 0.8966524142246324], [47, 0.8961309834185004], [48, 0.8964438419021796], [49, 0.8957138387735948]]

Dataset split: 0.9278339764313276

## Try Up-sample Minority Class

Seperate Minortity Class

In [43]:
data_minority = train_data[train_data.label == 1]
data_majority = train_data[train_data.label == 0]
print("length majority", len(data_majority))
print("length minority", len(data_minority))

length majority 29720
length minority 2242


Upsample minority class

In [44]:
data_minority = resample(data_minority, replace = True, n_samples=29720, random_state=55)

Combine the classes

In [45]:
data_upsampled = pd.concat([data_majority, data_minority])
data_upsampled.label.value_counts()

0    29720
1    29720
Name: label, dtype: int64

In [46]:
data_upsampled.head()

Unnamed: 0,id,label,tweet,tweet_converted_emojis,n_mentions,hashtags,without_punctuation,tweet_lower,tweet_token,clean_token,clean_hashtags,stemmed_tokens,stemmed_hashtags,lemmatized_tokens,lemmatized_hashtags
0,1,0,@user when a father is dysfunctional and is s...,@user when a father is dysfunctional and is s...,1,['run'],user when a father is dysfunctional and is so...,user when a father is dysfunctional and is so...,"['user', 'when', 'a', 'father', 'is', 'dysfunc...","['user', 'father', 'dysfunctional', 'selfish',...",['run'],"['user', 'father', 'dysfunct', 'selfish', 'dra...",['run'],"['user', 'father', 'dysfunctional', 'selfish',...",['run']
1,2,0,@user @user thanks for #lyft credit i can't us...,@user @user thanks for #lyft credit i can't us...,2,"['lyft', 'disapointed', 'getthanked']",user user thanks for lyft credit i cant use ca...,user user thanks for lyft credit i cant use ca...,"['user', 'user', 'thanks', 'for', 'lyft', 'cre...","['user', 'user', 'thanks', 'lyft', 'credit', '...","['lyft', 'disapointed', 'getthanked']","['user', 'user', 'thank', 'lyft', 'credit', 'c...","['lyft', 'disapoint', 'getthank']","['user', 'user', 'thanks', 'lyft', 'credit', '...","['lyft', 'disapointed', 'getthanked']"
2,3,0,bihday your majesty,bihday your majesty,0,[],bihday your majesty,bihday your majesty,"['bihday', 'your', 'majesty']","['bihday', 'majesty']",[],"['bihday', 'majesti']",[],"['bihday', 'majesty']",[]
3,4,0,#model i love u take with u all the time in ...,#model i love u take with u all the time in ...,0,['model'],model i love u take with u all the time in u...,model i love u take with u all the time in u...,"['model', 'i', 'love', 'u', 'take', 'with', 'u...","['model', 'love', 'u', 'take', 'u', 'time', 'u...",['model'],"['model', 'love', 'u', 'take', 'u', 'time', 'u...",['model'],"['model', 'love', 'u', 'take', 'u', 'time', 'u...",['model']
4,5,0,factsguide: society now #motivation,factsguide: society now #motivation,0,['motivation'],factsguide society now motivation,factsguide society now motivation,"['factsguide', 'society', 'now', 'motivation']","['factsguide', 'society', 'motivation']",['motivation'],"['factsguid', 'societi', 'motiv']",['motiv'],"['factsguide', 'society', 'motivation']",['motivation']


In [47]:
features = ["lemmatized_tokens", "lemmatized_hashtags", "n_mentions"]

X_upsampled = data_upsampled[features]
y_upsampled = data_upsampled.label

X.head()

Unnamed: 0,lemmatized_tokens,lemmatized_hashtags,n_mentions
0,"['user', 'father', 'dysfunctional', 'selfish',...",['run'],1
1,"['user', 'user', 'thanks', 'lyft', 'credit', '...","['lyft', 'disapointed', 'getthanked']",2
2,"['bihday', 'majesty']",[],0
3,"['model', 'love', 'u', 'take', 'u', 'time', 'u...",['model'],0
4,"['factsguide', 'society', 'motivation']",['motivation'],0


### Using TF-IDF Vectorizer

In [48]:
X_upsampled.shape

(59440, 3)

In [49]:
tf = TfidfVectorizer()

X_vec = tf.fit(X_upsampled["lemmatized_tokens"])
X_lem_up = X_vec.transform(X_upsampled["lemmatized_tokens"])

In [50]:
X_lem_up.shape

(59440, 40632)

## Split data

In [53]:
X_train_up, X_test_up, y_train_up, y_test_up = train_test_split(X_lem_up, y_upsampled, test_size=0.2, random_state = 17, stratify=y_upsampled)

In [54]:
print(f"Shape X_train {X_train_up.shape}")
print(f"Shape X_test {X_test_up.shape}")

Shape X_train (47552, 40632)
Shape X_test (11888, 40632)


## Fitting model

In [58]:
result_precision_score = []
result_recall_score = []
result_accuracy_score = []
result_f1_score = []
max_depth = 100

for i in range(5, max_depth):
    classifier = DecisionTreeClassifier(random_state=55, max_depth=i)

    model = classifier.fit(X_train_up, y_train_up)

    y_pred = model.predict(X_test_up)
    
    result_precision_score.append(metrics.precision_score(y_test_up, y_pred))
    result_recall_score.append(metrics.recall_score(y_test_up, y_pred))
    result_accuracy_score.append(metrics.accuracy_score(y_test_up, y_pred))
    result_f1_score.append(metrics.f1_score(y_test_up, y_pred))


#print(len([i for i in range(5, max_depth)]), len(result_precision_score),len(result_recall_score),len(result_accuracy_score),len(result_f1_score))


result_data = {
    "max_depth": [i for i in range(5, max_depth)],
    "precision_score": result_precision_score,
    "recall_score": result_recall_score,
    "accuracy_score": result_accuracy_score,
    "f1_score": result_f1_score
}
df_upsampled = pd.DataFrame(result_data)
df_upsampled

Unnamed: 0,max_depth,precision_score,recall_score,accuracy_score,f1_score
0,5,0.712679,0.661003,0.697258,0.685869
1,6,0.718378,0.679341,0.706511,0.698314
2,7,0.735017,0.695323,0.722325,0.714619
3,8,0.745266,0.715175,0.735363,0.729911
4,9,0.750129,0.735868,0.745373,0.742930
...,...,...,...,...,...
90,95,0.935266,0.906629,0.921938,0.920724
91,96,0.923417,0.914872,0.919499,0.919124
92,97,0.915602,0.916218,0.915882,0.915910
93,98,0.914252,0.918405,0.916134,0.916324


In [59]:
import plotly.express as px


fig = px.line(df_upsampled, x="max_depth", y= ["f1_score", "accuracy_score"], title="Lemmatisation results upsampeled")
fig.show()