Import Libs

In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# own packages
from preprocess_data import clean_text
from preprocess_data import lemmatize_text
from preprocess_data import tfidf_vec_transform, tfidf_vec_fit_transform
from preprocess_data import count_vectorizer
from models import model_sfnn_train
from models import model_rf_train, model_naives_bayes, model_logistic_regression, model_multinominalNB_train, predict_values



Function for reading the data

In [12]:
# get csv data
def read_Data(path):

    # Define column names
    column_names = ['label', 'text']

    df = pd.read_csv(path, delimiter='\t', encoding="utf-8-sig", header=None, names=column_names)
    df.columns = df.columns.str.replace('\ufeff', '')
    df['label'] = df['label'].astype(str).str.replace('\ufeff', '')
    df['label'] = df['label'].astype(int)
    
    return df



In [13]:
## read data windows
data_train_val = read_Data(r".\data\training_data_lowercase.csv")           # used for train and val data
data_test = read_Data(r".\data\testing_data_lowercase_nolabels.csv")        # data for the predictions

## read data Mac
#data_train_val = read_Data("data/training_data_lowercase.csv")           # used for train and val data
#data_test = read_Data("data/testing_data_lowercase_nolabels.csv")        # data for the predictions


## split data_train into train and val data
data_train, data_val = train_test_split(data_train_val, test_size=0.3, random_state=42, stratify=data_train_val['label'])


In [14]:
## preview data
#print(data_train.head, "\n")
#print(data_test.head, "\n")
print("Shape train data:\n", data_train_val.shape)
print("Shape test data:\n", data_test.shape)

# clean data
data_train['cleaned_text'] = data_train['text'].apply(clean_text)
data_val['cleaned_text'] = data_val['text'].apply(clean_text)
data_test['cleaned_text'] = data_test['text'].apply(clean_text)

print("Cleand text train: \n", data_train["cleaned_text"].head, "\n")
print("Cleand text test: \n", data_test["cleaned_text"].head, "\n")


## lemmatize data
data_train['lemmatized_text'] = data_train['cleaned_text'].apply(lemmatize_text)
data_val['lemmatized_text'] = data_val['cleaned_text'].apply(lemmatize_text)
data_test['lemmatized_text'] = data_test['cleaned_text'].apply(lemmatize_text)

print("lemmatized_text train: \n", data_train['lemmatized_text'].head, "\n")
print("lemmatized_text test: \n", data_test['lemmatized_text'].head, "\n")

# calc tf-idf matrix
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7, ngram_range=(1,1))

tfidf_matrix_train = vectorizer.fit_transform(data_train['lemmatized_text'])
tfidf_matrix_val = vectorizer.transform(data_val['lemmatized_text'])
tfidf_matrix_test = vectorizer.transform(data_test['lemmatized_text'])

# calc tf-idf matrix with uncleand data
X_train_tfidf = vectorizer.fit_transform(data_train['text'])
X_val_tfidf = vectorizer.transform(data_val['text'])
X_test_tfidf = vectorizer.transform(data_test['text'])

#print("tfidf_matrix_train_val: \n", tfidf_matrix_train, "\n")
#print("tfidf_matrix_test: \n", tfidf_matrix_test, "\n")


# count vectorizer
#X_train_vec = count_vectorizer(data_train['lemmatized_text']).toarray()
#X_val_vec = count_vectorizer(data_val['lemmatized_text']).toarray()



Shape train data:
 (34152, 2)
Shape test data:
 (9984, 2)
Cleand text train: 
 <bound method NDFrame.head of 21024    [trumps, paris, decision, followed, debate, mo...
1057     [texas, republican, bringing, two, antilgbt, b...
10514    [oops, mn, juror, case, cop, killed, philando,...
11891    [awesome, trump, tweets, two, rules, guide, tr...
32054    [australian, medical, group, wants, access, ma...
                               ...                        
5267     [trump, sends, hillary, pathetic, threat, one,...
13642    [breaking, fbi, investigating, clinton, crony,...
22524    [us, lawmakers, seek, visas, afghans, helped, ...
883                                         [like, father]
3939     [hilariously, amazing, video, weve, waiting, e...
Name: cleaned_text, Length: 23906, dtype: object> 

Cleand text test: 
 <bound method NDFrame.head of 0       [copycat, muslim, terrorist, arrested, assault...
1       [wow, chicago, protester, caught, camera, admi...
2       [germanys, fdp, 

In [None]:
# Naive Bayes Model
model_nb = model_naives_bayes(tfidf_matrix_train, data_train['label'], tfidf_matrix_val, data_val['label'])


Train accuracy: 0.9527315318330126

 Validation Accuracy:0.928069

classification_report:
               precision    recall  f1-score   support

           0     0.9212    0.9406    0.9308      5272
           1     0.9356    0.9148    0.9251      4974

    accuracy                         0.9281     10246
   macro avg     0.9284    0.9277    0.9280     10246
weighted avg     0.9282    0.9281    0.9280     10246



In [None]:
# Random Forest
model_rf = model_rf_train(tfidf_matrix_train, tfidf_matrix_val, data_train['label'], data_val['label'])

Train Accuracy: 100.00%
Random forest model
accuracy: 0.9124536404450517
Classification report:
               precision    recall  f1-score   support

           0       0.91      0.92      0.91      5272
           1       0.91      0.91      0.91      4974

    accuracy                           0.91     10246
   macro avg       0.91      0.91      0.91     10246
weighted avg       0.91      0.91      0.91     10246



In [17]:
# Multinomial Naive Bayes (MultinomialNB) classifier
model_mnb = model_multinominalNB_train(data_train['lemmatized_text'], data_val['lemmatized_text'], data_train['label'], data_val['label'])


Train Accuracy: 95.09%
Test
accuracy: 0.9331446418114386
Classification report:
               precision    recall  f1-score   support

           0       0.93      0.94      0.94      5272
           1       0.94      0.93      0.93      4974

    accuracy                           0.93     10246
   macro avg       0.93      0.93      0.93     10246
weighted avg       0.93      0.93      0.93     10246



In [18]:
# Simple Feedforward NN -> no good results
model_sfnn_train(tfidf_matrix_train, data_train['label'], tfidf_matrix_val, data_val['label'])


Epoch 1/20


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m187/187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.8379 - loss: 0.3904 - val_accuracy: 0.9124 - val_loss: 0.2191
Epoch 2/20
[1m187/187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9193 - loss: 0.2017 - val_accuracy: 0.9175 - val_loss: 0.2071
Epoch 3/20
[1m187/187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9316 - loss: 0.1735 - val_accuracy: 0.9203 - val_loss: 0.2043
Epoch 4/20
[1m187/187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9448 - loss: 0.1460 - val_accuracy: 0.9198 - val_loss: 0.2105
Epoch 5/20
[1m187/187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9577 - loss: 0.1145 - val_accuracy: 0.9233 - val_loss: 0.2112
Epoch 6/20
[1m187/187[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9700 - loss: 0.0861 - val_accuracy: 0.9223 - val_loss: 0.2204
Epoch 7/20
[1m187/187[0m [32m━━━━━━━

<Sequential name=sequential_1, built=True>

In [19]:
model_lr = model_logistic_regression(tfidf_matrix_train, data_train['label'], tfidf_matrix_val, data_val['label'])


Train Accuracy: 95.41%

Validation Accuracy: 92.68%

Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.92      0.93      5272
           1       0.91      0.94      0.93      4974

    accuracy                           0.93     10246
   macro avg       0.93      0.93      0.93     10246
weighted avg       0.93      0.93      0.93     10246



In [None]:
## make predictions on test data
filepath = r".\data\testing_data_lowercase_labels.csv"

predict_values(model_rf, tfidf_matrix_test, data_test, filepath)
predict_values(model_nb, tfidf_matrix_test, data_test, filepath)

✅ Predictions saved to: .\data\testing_data_lowercase_labels.csv
