Import Libs

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# own packages
from preprocess_data import clean_text
from preprocess_data import lemmatize_text
from preprocess_data import tfidf_vec_transform, tfidf_vec_fit_transform
from preprocess_data import count_vectorizer
from models import model_sfnn_train
from models import model_rf_train, model_naives_bayes, model_logistic_regression, model_multinominalNB_train, predict_values, model_xgboost



## Importing and reading the data

In [3]:
# get csv data
def read_Data(path):

    # Define column names
    column_names = ['label', 'text']

    df = pd.read_csv(path, delimiter='\t', encoding="utf-8-sig", header=None, names=column_names)
    df.columns = df.columns.str.replace('\ufeff', '')
    df['label'] = df['label'].astype(str).str.replace('\ufeff', '')
    df['label'] = df['label'].astype(int)
    
    return df



In [6]:
## read data windows
#data_train_val = read_Data(r".\data\training_data_lowercase.csv")           # used for train and val data
#data_test = read_Data(r".\data\testing_data_lowercase_nolabels.csv")        # data for the predictions

## read data Mac
data_train_val = read_Data("data/training_data_lowercase.csv")           # used for train and val data
data_test = read_Data("data/testing_data_lowercase_nolabels.csv")        # data for the predictions


## split data_train into train and val data
data_train, data_val = train_test_split(data_train_val, test_size=0.08, random_state=42, stratify=data_train_val['label'])


In [None]:
## preview data
#print(data_train.head, "\n")
#print(data_test.head, "\n")
print("Shape train data:\n", data_train_val.shape)
print("Shape test data:\n", data_test.shape)


## Pre-processing data

Setup with advanced data cleaning

In [7]:
# Preprocessing steps

# clean data
data_train['cleaned_text'] = data_train['text'].apply(clean_text)
data_val['cleaned_text'] = data_val['text'].apply(clean_text)
data_test['cleaned_text'] = data_test['text'].apply(clean_text)

print("Cleand text train: \n", data_train["cleaned_text"].head, "\n")
print("Cleand text test: \n", data_test["cleaned_text"].head, "\n")


## lemmatize data
data_train['lemmatized_text'] = data_train['cleaned_text'].apply(lemmatize_text)
data_val['lemmatized_text'] = data_val['cleaned_text'].apply(lemmatize_text)
data_test['lemmatized_text'] = data_test['cleaned_text'].apply(lemmatize_text)

print("lemmatized_text train: \n", data_train['lemmatized_text'].head, "\n")
print("lemmatized_text test: \n", data_test['lemmatized_text'].head, "\n")

# calc tf-idf matrix on lemmatized data
vectorizer = TfidfVectorizer(ngram_range=(1,2))

tfidf_matrix_train = vectorizer.fit_transform(data_train['lemmatized_text'])
tfidf_matrix_val = vectorizer.transform(data_val['lemmatized_text'])
tfidf_matrix_test = vectorizer.transform(data_test['lemmatized_text'])


Cleand text train: 
 <bound method NDFrame.head of 6602     [fox, host, throws, hissy, fit, cant, call, pr...
930      [trump, unpopular, crowds, bussed, cheer, pola...
29744    [congratulations, eu, moves, brexit, phase, tw...
24734    [trumps, national, security, adviser, vows, ta...
15772    [republican, turns, tables, fbi, deputy, direc...
                               ...                        
27469    [us, pledges, strong, response, event, another...
27793    [republican, presidential, candidate, cruz, ra...
883                                         [like, father]
25547    [congress, passes, funding, bill, averting, go...
3939     [hilariously, amazing, video, weve, waiting, e...
Name: cleaned_text, Length: 31419, dtype: object> 

Cleand text test: 
 <bound method NDFrame.head of 0       [copycat, muslim, terrorist, arrested, assault...
1       [wow, chicago, protester, caught, camera, admi...
2       [germanys, fdp, look, fill, schaeubles, big, s...
3       [mi, school, sen

**TF-IDF only**

With and without stop-words function

Pre-processing setup 2: TF-IDF Vectorizer (without stopwords argument)

In [14]:
# calc tf-idf matrix with uncleaned data
vectorizer2 = TfidfVectorizer(ngram_range=(1, 3))
X_train_tfidf = vectorizer2.fit_transform(data_train['text'])
X_val_tfidf = vectorizer2.transform(data_val['text'])
X_test_tfidf = vectorizer2.transform(data_test['text'])


Pre-processing setup 3: TF-IDF Vectorizer (without stopwords argument)

In [None]:

# calc tf-idf matrix with uncleaned data, tf-idf removing stopwords
vectorizer3 = TfidfVectorizer(stop_words="english", ngram_range=(1, 3))
X_train_tfidf3 = vectorizer3.fit_transform(data_train['text'])
X_val_tfidf3 = vectorizer3.transform(data_val['text'])
X_test_tfidf3 = vectorizer3.transform(data_test['text'])

## Applying and evaluating models

### Naive Bayes Model

In [18]:
# Preprocessing setup 1: calc tf-idf on lemmatized data
model_nb1 = model_naives_bayes(tfidf_matrix_train, data_train['label'], tfidf_matrix_val, data_val['label'])


Train Accuracy:1.00

Validation Accuracy: 0.94

classification_report:
               precision    recall  f1-score   support

           0     0.9390    0.9531    0.9460      1406
           1     0.9495    0.9344    0.9419      1327

    accuracy                         0.9440      2733
   macro avg     0.9442    0.9437    0.9439      2733
weighted avg     0.9441    0.9440    0.9440      2733



In [19]:
# Preprocessing setup 2: tf-idf only
model_nb2 = model_naives_bayes(X_train_tfidf, data_train['label'], X_val_tfidf, data_val['label'])


Train Accuracy:1.00

Validation Accuracy: 0.95

classification_report:
               precision    recall  f1-score   support

           0     0.9383    0.9623    0.9501      1406
           1     0.9589    0.9329    0.9458      1327

    accuracy                         0.9480      2733
   macro avg     0.9486    0.9476    0.9480      2733
weighted avg     0.9483    0.9480    0.9480      2733



In [20]:
# Preprocessing setup 3: tf-idf removing stopwords
model_nb3 = model_naives_bayes(X_train_tfidf3, data_train['label'], X_val_tfidf3, data_val['label'])


Train Accuracy:1.00

Validation Accuracy: 0.94

classification_report:
               precision    recall  f1-score   support

           0     0.9350    0.9516    0.9432      1406
           1     0.9478    0.9299    0.9388      1327

    accuracy                         0.9411      2733
   macro avg     0.9414    0.9408    0.9410      2733
weighted avg     0.9412    0.9411    0.9411      2733



### XGBoost

In [21]:
# XGBoost Setup 1
model_xgb1 = model_xgboost(tfidf_matrix_train, data_train['label'], tfidf_matrix_val, data_val['label'])

Train Accuracy: 0.90

Validation Accuracy: 0.84

Classification Report:
               precision    recall  f1-score   support

           0     0.8827    0.7973    0.8378      1406
           1     0.8052    0.8877    0.8444      1327

    accuracy                         0.8412      2733
   macro avg     0.8439    0.8425    0.8411      2733
weighted avg     0.8451    0.8412    0.8410      2733



In [22]:
# XGBoost Setup 2
model_xgb2 = model_xgboost(X_train_tfidf, data_train['label'], X_val_tfidf, data_val['label'])

Train Accuracy: 0.93

Validation Accuracy: 0.83

Classification Report:
               precision    recall  f1-score   support

           0     0.8162    0.8620    0.8385      1406
           1     0.8446    0.7943    0.8186      1327

    accuracy                         0.8291      2733
   macro avg     0.8304    0.8281    0.8286      2733
weighted avg     0.8299    0.8291    0.8288      2733



In [23]:
# XGBoost Setup 3
model_xgb3 = model_xgboost(X_train_tfidf3, data_train['label'], X_val_tfidf3, data_val['label'])

Train Accuracy: 0.90

Validation Accuracy: 0.82

Classification Report:
               precision    recall  f1-score   support

           0     0.8192    0.8378    0.8284      1406
           1     0.8239    0.8041    0.8139      1327

    accuracy                         0.8214      2733
   macro avg     0.8216    0.8210    0.8211      2733
weighted avg     0.8215    0.8214    0.8214      2733



### Random Forest

In [24]:
# Random Forest setup 1
model_rf1 = model_rf_train(tfidf_matrix_train, tfidf_matrix_val, data_train['label'], data_val['label'])

Train Accuracy: 100.00%
Random forest model
accuracy: 0.9063300402488108
Classification report:
               precision    recall  f1-score   support

           0       0.91      0.90      0.91      1406
           1       0.90      0.91      0.90      1327

    accuracy                           0.91      2733
   macro avg       0.91      0.91      0.91      2733
weighted avg       0.91      0.91      0.91      2733



In [25]:
# Random Forest setup 2
model_rf2 = model_rf_train(X_train_tfidf, X_val_tfidf, data_train['label'], data_val['label'])

Train Accuracy: 100.00%
Random forest model
accuracy: 0.9085254299304794
Classification report:
               precision    recall  f1-score   support

           0       0.93      0.89      0.91      1406
           1       0.89      0.93      0.91      1327

    accuracy                           0.91      2733
   macro avg       0.91      0.91      0.91      2733
weighted avg       0.91      0.91      0.91      2733



In [26]:
# Random Forest setup 2
model_rf3 = model_rf_train(X_train_tfidf3, X_val_tfidf3, data_train['label'], data_val['label'])

Train Accuracy: 100.00%
Random forest model
accuracy: 0.9008415660446396
Classification report:
               precision    recall  f1-score   support

           0       0.89      0.93      0.91      1406
           1       0.92      0.87      0.90      1327

    accuracy                           0.90      2733
   macro avg       0.90      0.90      0.90      2733
weighted avg       0.90      0.90      0.90      2733



In [8]:
# Multinomial Naive Bayes (MultinomialNB) classifier
#model_mnb = model_multinominalNB_train(data_train['lemmatized_text'], data_val['lemmatized_text'], data_train['label'], data_val['label'])


### Logistic Regression

In [27]:
#Logistic Regression setup1
model_lr1 = model_logistic_regression(tfidf_matrix_train, data_train['label'], tfidf_matrix_val, data_val['label'])


Train Accuracy: 96.91%

Validation Accuracy: 93.27%

Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.92      0.93      1406
           1       0.92      0.94      0.93      1327

    accuracy                           0.93      2733
   macro avg       0.93      0.93      0.93      2733
weighted avg       0.93      0.93      0.93      2733



In [28]:
#Logistic Regression setup2
model_lr2 = model_logistic_regression(X_train_tfidf, data_train['label'], X_val_tfidf, data_val['label'])


Train Accuracy: 97.62%

Validation Accuracy: 94.04%

Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.94      0.94      1406
           1       0.94      0.94      0.94      1327

    accuracy                           0.94      2733
   macro avg       0.94      0.94      0.94      2733
weighted avg       0.94      0.94      0.94      2733



In [29]:
#Logistic Regression setup3
model_lr3 = model_logistic_regression(X_train_tfidf3, data_train['label'], X_val_tfidf3, data_val['label'])


Train Accuracy: 97.54%

Validation Accuracy: 92.54%

Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.92      0.93      1406
           1       0.91      0.94      0.92      1327

    accuracy                           0.93      2733
   macro avg       0.93      0.93      0.93      2733
weighted avg       0.93      0.93      0.93      2733



In [33]:
# Simple Feedforward NN with preprocessing setup1
model_sfnn1 = model_sfnn_train(tfidf_matrix_train, data_train['label'], tfidf_matrix_val, data_val['label'])


Epoch 1/20


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.8606 - loss: 0.3232 - val_accuracy: 0.9290 - val_loss: 0.1884
Epoch 2/20
[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.9380 - loss: 0.1592 - val_accuracy: 0.9327 - val_loss: 0.1737
Epoch 3/20
[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.9522 - loss: 0.1225 - val_accuracy: 0.9356 - val_loss: 0.1793
Epoch 4/20
[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.9676 - loss: 0.0882 - val_accuracy: 0.9341 - val_loss: 0.1864
Epoch 5/20
[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.9810 - loss: 0.0584 - val_accuracy: 0.9393 - val_loss: 0.2043
Epoch 6/20
[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.9869 - loss: 0.0402 - val_accuracy: 0.9371 - val_loss: 0.2254
Epoch 7/20
[1m246/246[0m [32m━━━━━━━

In [32]:
# Simple Feedforward NN with preprocessing setup2
model_sfnn2 = model_sfnn_train(X_train_tfidf, data_train['label'], X_val_tfidf, data_val['label'])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.8642 - loss: 0.3097 - val_accuracy: 0.9404 - val_loss: 0.1570
Epoch 2/20
[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.9472 - loss: 0.1378 - val_accuracy: 0.9447 - val_loss: 0.1581
Epoch 3/20
[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9591 - loss: 0.1082 - val_accuracy: 0.9477 - val_loss: 0.1577
Epoch 4/20
[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9722 - loss: 0.0777 - val_accuracy: 0.9437 - val_loss: 0.1796
Epoch 5/20
[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.9812 - loss: 0.0564 - val_accuracy: 0.9480 - val_loss: 0.1836
Epoch 6/20
[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.9870 - loss: 0.0399 - val_accuracy: 0.9477 - val_loss: 0.2092
[1m982/982[0m [32m━━━━━━━

In [None]:
# Simple Feedforward NN with preprocessing setup2
model_sfnn3 = model_sfnn_train(X_train_tfidf3, data_train['label'], X_val_tfidf3, data_val['label'])

In [11]:
## make predictions on test data
filepath = r".\data\testing_data_lowercase_labels.csv"

#predict_values(model_rf, tfidf_matrix_test, data_test, filepath)
predict_values(model_nb, tfidf_matrix_test, data_test, filepath)

✅ Predictions saved to: .\data\testing_data_lowercase_labels.csv
