# Classification through TF-IDF

In [1]:
import os
import json

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score

import pandas as pd
pd.set_option('display.max_colwidth', None)

## Read data

In [2]:
train_df = pd.read_csv(os.path.join('data', 'train.csv'))

## Split training-validation data

In [3]:
training_df, validation_df = train_test_split(train_df,
                                              test_size=0.15,
                                              stratify=train_df.target) 

## Create TF-IDF features from message

In [4]:
tf_idf = TfidfVectorizer(max_features=5000)
tf_idf = tf_idf.fit(train_df.text)

training_tf_idf = tf_idf.transform(training_df.text)
validation_tf_idf = tf_idf.transform(validation_df.text)

## Training

In [5]:
model = GradientBoostingClassifier()
model.fit(training_tf_idf, training_df.target)

GradientBoostingClassifier()

In [6]:
def evaluate(X, y):
    f1_0, f1_1 = f1_score(y, model.predict(X), average=None)
    print(f'F1 score for negative class: {f1_0}')
    print(f'F1 score for positive class: {f1_1}')
    f1_macro = f1_score(y, model.predict(X), average='macro')
    print(f'F1 macro score: {f1_macro}')
    return f1_macro


train_f1_macro = evaluate(training_tf_idf, training_df.target)

F1 score for negative class: 0.837496411139822
F1 score for positive class: 0.7193852255825484
F1 macro score: 0.7784408183611852


In [7]:
val_f1_macro = evaluate(validation_tf_idf, validation_df.target)

F1 score for negative class: 0.7971586424625098
F1 score for positive class: 0.6192592592592593
F1 macro score: 0.7082089508608845


## Test evaluation

In [8]:
test_df = pd.read_csv(os.path.join('data', 'test.csv'))
test_f1_macro = evaluate(tf_idf.transform(test_df.text),
                         test_df.target)

F1 score for negative class: 0.7988826815642459
F1 score for positive class: 0.6619718309859154
F1 macro score: 0.7304272562750806


## Store metrics training metrics

In [9]:
with open('train_metrics.json', 'w') as f:
    json.dump({'f1_macro': train_f1_macro}, f)

with open('val_metrics.json', 'w') as f:
    json.dump({'f1_macro': val_f1_macro}, f)
    
with open('test_metrics.json', 'w') as f:
    json.dump({'f1_macro': test_f1_macro}, f)