In [1]:
import sys
import time
import warnings
from tqdm import tqdm
from math import log
#data processing
import pandas as pd
import numpy as np

#visualisation
import matplotlib.pyplot as plt 
import seaborn as sns

from scipy.stats import kruskal, pearsonr, ttest_ind
from sklearn import preprocessing
from collections import Counter
# Metrics
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn.metrics import log_loss, classification_report, confusion_matrix

from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import ShuffleSplit

# ML
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn import tree


from logic.feature_extraction import FeatureExtraction
from logic.text_analysis import TextAnalysis
from root import DIR_INPUT

In [2]:
# Language code used to pick the input CSV files and to configure
# TextAnalysis / FeatureExtraction in the cells below.
lang = 'es'

In [3]:
# Build the train/test CSV paths as DIR_INPUT + file prefix + language code + extension.
path_template = '{0}{1}{2}{3}'
csv_extension = '.csv'
ftrain = path_template.format(DIR_INPUT, 'Valence_train_oc_', lang, csv_extension)
ftest = path_template.format(DIR_INPUT, 'Valence_test_oc_', lang, csv_extension)

In [None]:
# Text-analysis helper for the selected language; reused below to clean the
# tweets and handed to FeatureExtraction.
ta = TextAnalysis(lang=lang)

In [None]:
# Feature extractor for the selected language, sharing the TextAnalysis
# instance created in the previous cell.
features = FeatureExtraction(lang=lang, text_analysis=ta)

In [None]:
# Load the training set; the file is semicolon-separated. Later cells read its
# 'ID', 'Tweet', 'Dimension', 'Description' and 'Intensity' columns.
train_data = pd.read_csv(ftrain, sep = ';')

In [None]:
# Add a 'clean_text' column holding each tweet after TextAnalysis.clean_text,
# then display the frame.
raw_train_tweets = train_data['Tweet']
train_data['clean_text'] = raw_train_tweets.apply(ta.clean_text)
train_data

In [None]:
# Keep only the columns that are not listed here (e.g. 'Intensity' and
# 'clean_text'); the raw tweet and metadata columns are dropped.
unused_train_columns = ['ID', 'Tweet', 'Dimension', 'Description']
train = train_data.drop(columns=unused_train_columns)

In [None]:
# Collapse the intensity scale: -2/-3 become -1 and 2/3 become 1; every other
# value is left as it is.
train['Intensity'] = (
    train['Intensity']
    .replace([-2, -3], -1)
    .replace([2, 3], 1)
)

In [None]:
# Display the prepared training frame (metadata columns dropped, intensity
# values of magnitude 2/3 folded into -1/1).
train

# Test

In [None]:
# Load the test set; same semicolon-separated layout as the training file.
test_data = pd.read_csv(ftest, sep = ';')

In [None]:
# Add a 'clean_text' column holding each test tweet after
# TextAnalysis.clean_text, then display the frame.
raw_test_tweets = test_data['Tweet']
test_data['clean_text'] = raw_test_tweets.apply(ta.clean_text)
test_data

In [None]:
# Drop the raw tweet and metadata columns from the test frame, mirroring the
# training preparation.
unused_test_columns = ['ID', 'Tweet', 'Dimension', 'Description']
test = test_data.drop(columns=unused_test_columns)

In [None]:
# Collapse the test intensity scale the same way as for training:
# -2/-3 become -1 and 2/3 become 1. Then display the frame.
test['Intensity'] = (
    test['Intensity']
    .replace([-2, -3], -1)
    .replace([2, 3], 1)
)
test

In [None]:
# Split the training frame into model input (cleaned text) and target (intensity).
x_train, y_train = train['clean_text'], train['Intensity']

In [None]:
# Split the test frame into model input (cleaned text) and target (intensity).
x_test, y_test = test['clean_text'], test['Intensity']

## Get training features

In [None]:
# Replace the cleaned training texts with the output of
# FeatureExtraction.get_feature_phonestheme.
# NOTE(review): presumably a dense 2-D numeric array, since it is passed to
# preprocessing.normalize and to GaussianNB later — confirm.
# NOTE(review): this cell rebinds x_train, so re-running it without re-running
# the cell that defines x_train feeds features back into the extractor.
x_train = features.get_feature_phonestheme(x_train)

In [None]:
# Rescale the training features with sklearn.preprocessing.normalize using its
# default arguments.
# NOTE(review): per the scikit-learn docs the defaults scale each sample (row)
# to unit L2 norm — confirm for the installed version.
x_train = preprocessing.normalize(x_train)

## Get test features

In [None]:
# Extract the same features for the test texts and normalise them with the
# same default settings used for the training features.
test_feature_matrix = features.get_feature_phonestheme(x_test)
x_test = preprocessing.normalize(test_feature_matrix)

## Resampling (under-sample train, over-sample test)

In [None]:
# Report how many samples each intensity class has before any resampling.
train_class_counts = sorted(Counter(y_train).items())
test_class_counts = sorted(Counter(y_test).items())
print('**Sample train:', train_class_counts)
print('**Sample test:', test_class_counts)

In [None]:
# Balance the training classes by randomly discarding samples of the larger
# classes (under-sampling). The variable keeps its historical name `ros_train`
# so that any later reference still resolves, although it holds an
# under-sampler rather than an over-sampler.
ros_train = RandomUnderSampler(random_state=1000)
x_train, y_train = ros_train.fit_resample(x_train, y_train)
# The label now names the sampler actually used; it previously said "OverSampler".
print('**RandomUnderSampler train:', sorted(Counter(y_train).items()))

In [None]:
# Randomly over-sample the test set and report the resulting class counts.
# NOTE(review): resampling the held-out test set changes its class
# distribution, so test metrics no longer describe the original data —
# confirm that this is intended.
ros_test = RandomOverSampler(random_state=1000)
resampled_test = ros_test.fit_resample(x_test, y_test)
x_test, y_test = resampled_test
resampled_test_counts = sorted(Counter(y_test).items())
print('**RandomOverSampler test:', resampled_test_counts)

In [None]:
# Cross-validation splitter: 10 random shuffles, each holding out 30% of the
# rows for validation; seeded so the splits are repeatable.
k_fold = ShuffleSplit(n_splits=10, test_size=0.30, random_state=42)

In [None]:
# Candidate classifiers as (short name, estimator) pairs.
# Every stochastic estimator is seeded so that a fresh Restart & Run All
# reproduces the same scores. The decision tree was previously unseeded, and
# scikit-learn permutes features at each split even with splitter="best".
models = [("RF", RandomForestClassifier(max_depth=200, n_estimators=200, random_state=42)),
          ("DT", DecisionTreeClassifier(max_depth=4, random_state=42)),
          ("NB", GaussianNB())]

In [None]:
# Cross-validate every model on the resampled training set, then refit it on
# the full training set and score it once on the test set.
#
# Outputs:
#   finalResults - one dict per model: name, fitted estimator, mean CV
#                  accuracy / macro precision / macro recall / macro F1
#                  (rounded to 2 decimals) and the test-set confusion matrix.
#   cmList       - (name, confusion_matrix) pairs, used for plotting.
finalResults = []
cmList = []

# ShuffleSplit yields *positional* indices. If y_train is a pandas Series,
# `y_train[int_array]` is label-based, so index a plain array view instead.
cv_targets = np.asarray(y_train)

for name, model in models:
    accuracies = []
    precisions = []
    recalls = []
    f1s = []
    for train_index, test_index in k_fold.split(x_train, cv_targets):
        data_train, target_train = x_train[train_index], cv_targets[train_index]
        data_test, target_test = x_train[test_index], cv_targets[test_index]

        # fit() re-initialises the estimator, so reusing `model` across folds is safe.
        model.fit(data_train, target_train)
        predict = model.predict(data_test)

        accuracies.append(accuracy_score(target_test, predict, normalize=True))
        precisions.append(precision_score(target_test, predict, average="macro"))
        recalls.append(recall_score(target_test, predict, average="macro"))
        f1s.append(f1_score(target_test, predict, average="macro"))

    # Refit on the whole training set before touching the test set. Without
    # this, the test evaluation (and the stored estimator) would come from
    # whatever 70% subset the last shuffle split happened to select.
    model.fit(x_train, cv_targets)
    y_predict = model.predict(x_test)
    cm = confusion_matrix(y_test, y_predict)
    cmList.append((name, cm))

    finalResults.append({'name': name,
                         'model': model,
                         'accuracy': round(np.mean(accuracies), 2),
                         'precision': round(np.mean(precisions), 2),
                         'recall': round(np.mean(recalls), 2),
                         'f1': round(np.mean(f1s), 2),
                         'confusion_matrix': cm
                        })

In [None]:
# Tabulate the per-model results (one row per model) and display the table.
df_result = pd.DataFrame(finalResults)
df_result

In [None]:
# Draw one annotated confusion-matrix heatmap per model.
for name, matrix in cmList:
    fig, ax = plt.subplots()
    # Confusion-matrix cells are integer counts, so annotate them with "d"
    # rather than as floats. `linewidths` is seaborn's documented keyword for
    # the cell borders.
    sns.heatmap(matrix, annot=True, linewidths=0.8, fmt="d", ax=ax)
    # sklearn's confusion_matrix puts true labels on rows and predictions on columns.
    ax.set(title=name, xlabel="Predicted class", ylabel="True class")
    plt.show()
    # Release the figure so repeated runs do not accumulate open figures.
    plt.close(fig)