In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier

from keras.layers import Dense, Input, Activation, Conv1D
from keras.layers import Dropout, MaxPooling1D, Flatten, Concatenate, Reshape
from keras.models import Sequential, Model, load_model
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.utils import np_utils

import re
import random
import os

Using TensorFlow backend.


In [2]:
# Load the training TSV, keeping only the label ('class') and text ('tweet') columns.
df = pd.read_csv('../task2_data/task2_en_training.tsv', sep='\t')[['class', 'tweet']]
# Cast the labels to int64 in one vectorized step instead of calling int() per row.
df['class'] = df['class'].astype('int64')
# Show the frame; df.info() prints its summary as a side effect and returns None,
# which is why the displayed tuple ends with None.
df, df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20544 entries, 0 to 20543
Data columns (total 2 columns):
class    20544 non-null int64
tweet    20544 non-null object
dtypes: int64(1), object(1)
memory usage: 321.1+ KB


(       class                                              tweet
 0          0                depression hurts, cymbalta can help
 1          0  @jessicama20045 right, but cipro can make thin...
 2          0         @fibby1123 are you on paxil .. i need help
 3          0  @redicine the lamotrigine and sjs just made ch...
 4          0  have decided to skip my #humira shot today. my...
 ...      ...                                                ...
 20539      1  @hornetweb my mri scan shows when it happens b...
 20540      1  remember 2003. vioxx costs $65m to ontario dru...
 20541      1  asians are at higher risk for severe allergic ...
 20542      1  5. so what caused the #estrogen surges in #nuv...
 20543      1  @twittalesskels 😂😂😂😂 i'm high off this tamiflu...
 
 [20544 rows x 2 columns], None)

In [3]:
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold

# Fixed seed so the MLP weight initialisation (and therefore the reported
# scores) is the same on every Restart & Run All.
SEED = 42

X = df['tweet']
Y = df['class']

# 5-fold stratified cross-validation: every fold keeps the class ratio of Y.
skf = StratifiedKFold(n_splits=5)

print('Stratified K-Fold')
f1 = []  # macro-F1 of each fold, in percent, rounded to 3 decimals
for train_index, test_index in skf.split(X, Y):
    # skf.split yields positional indices, so select rows with .iloc; plain
    # X[...] is label-based and only works while the index is 0..n-1.
    train_text, test_text = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = Y.iloc[train_index], Y.iloc[test_index]

    # Fit the vectorizer on the training fold only so that no vocabulary or
    # IDF statistics leak in from the held-out fold.
    tfidf = TfidfVectorizer()
    X_train = tfidf.fit_transform(train_text)
    X_test = tfidf.transform(test_text)

    # Model
    model = MLPClassifier(random_state=SEED)
    model.fit(X_train, y_train)

    predictions_model = model.predict(X_test)
    # f1_score expects (y_true, y_pred) in that order.
    f1.append(round(f1_score(y_test, predictions_model, average='macro') * 100, 3))
    print("Model F1 Score -> ", f1[-1])

# Summary statistics over the per-fold scores.
f1_scores = np.array(f1)
print(f'F1 average:\t {round(f1_scores.mean(), 3)} %')
print(f'F1 std:\t {round(f1_scores.std(), 3)}')
print(f'F1 var:\t {round(f1_scores.var(), 3)}')
print()

Stratified K-Fold
Model F1 Score ->  70.013
Model F1 Score ->  68.599
Model F1 Score ->  60.912
Model F1 Score ->  72.936
Model F1 Score ->  57.482
F1 average:	 65.988 %
F1 std:	 5.821
F1 var:	 33.882

