Import Libs

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# own packages
from preprocess_data import clean_text
from preprocess_data import lemmatize_text
from preprocess_data import tfidf_vectorization
from preprocess_data import count_vectorizer
from models import model_rf_train
from models import model_multinominalNB_train
from model_MultinomialNB import model_mnb_train


Function for reading the data

In [2]:
# get csv data
def read_Data(path):

    # Define column names
    column_names = ['label', 'text']

    df = pd.read_csv(path, delimiter='\t', encoding="utf-8-sig", header=None, names=column_names)
    df.columns = df.columns.str.replace('\ufeff', '')
    df['label'] = df['label'].astype(str).str.replace('\ufeff', '')
    df['label'] = df['label'].astype(int)
    
    return df



In [3]:
## read data
data_train_val = read_Data(r".\data\training_data_lowercase.csv")           # used for train and val data
data_test = read_Data(r".\data\testing_data_lowercase_nolabels.csv")        # data for the predictions

## split data_train into train and val data
data_train, data_val = train_test_split(data_train_val, test_size=0.2, random_state=42)


In [4]:
 ## preview data
#print(data_train.head, "\n")
#print(data_test.head, "\n")
print("Shape train data:\n", data_train_val.shape)
print("Shape test data:\n", data_test.shape)

# clean data
data_train['cleaned_text'] = data_train['text'].apply(clean_text)
data_val['cleaned_text'] = data_val['text'].apply(clean_text)
data_test['cleaned_text'] = data_test['text'].apply(clean_text)

print("Cleand text train: \n", data_train["cleaned_text"].head, "\n")
print("Cleand text test: \n", data_test["cleaned_text"].head, "\n")


## lemmatize data
data_train['lemmatized_text'] = data_train['cleaned_text'].apply(lemmatize_text)
data_val['lemmatized_text'] = data_val['cleaned_text'].apply(lemmatize_text)
data_test['lemmatized_text'] = data_test['cleaned_text'].apply(lemmatize_text)

print("lemmatized_text train: \n", data_train['lemmatized_text'].head, "\n")
print("lemmatized_text test: \n", data_test['lemmatized_text'].head, "\n")

# calc tf-idf matrix
tfidf_matrix_train = tfidf_vectorization(data_train['lemmatized_text'])
tfidf_matrix_val = tfidf_vectorization(data_val['lemmatized_text'])
tfidf_matrix_test = tfidf_vectorization(data_test['lemmatized_text'])

print("tfidf_matrix_train_val: \n", tfidf_matrix_train, "\n")
print("tfidf_matrix_test: \n", tfidf_matrix_test, "\n")

# count vectorizer
X_train_vec = count_vectorizer(data_train['lemmatized_text']).toarray()
X_val_vec = count_vectorizer(data_val['lemmatized_text']).toarray()



Shape train data:
 (34152, 2)
Shape test data:
 (9984, 2)
Cleand text train: 
 <bound method NDFrame.head of 8891                                                    []
25115    [final, reckoning, approaches, obamas, high, c...
26933    [illinois, budget, talks, fizzle, amid, partis...
26971    [clinton, spokesman, ig, report, shows, clinto...
11387    [busted, nancy, pelosi, claims, meeting, russi...
                               ...                        
16850                  [senate, passes, usa, freedom, act]
6265     [oklahoma, republicans, trying, impeach, obama...
11284    [texas, congressman, lets, screaming, leftist,...
860      [trump, stole, idea, north, korean, propaganda...
15795    [outrageous, nancy, pelosi, claims, obamacare,...
Name: cleaned_text, Length: 27321, dtype: object> 

Cleand text test: 
 <bound method NDFrame.head of 0       [copycat, muslim, terrorist, arrested, assault...
1       [wow, chicago, protester, caught, camera, admi...
2       [germanys, fdp, 

In [None]:
## call models

#model_rf_train(tfidf_matrix_train, tfidf_matrix_val, data_train['label'], data_val['label'])                                           # Random Forest
model_multinominalNB_train(data_train['lemmatized_text'], data_val['lemmatized_text'], data_train['label'], data_val['label'])          # Multinomial Naive Bayes (MultinomialNB) classifier


Test
accuracy: 0.9322207583077148
Classification report:
               precision    recall  f1-score   support

           0       0.93      0.94      0.93      3529
           1       0.93      0.93      0.93      3302

    accuracy                           0.93      6831
   macro avg       0.93      0.93      0.93      6831
weighted avg       0.93      0.93      0.93      6831

