In [1]:
import pandas as pd
import numpy as np
from tabulate import tabulate
from sklearn.utils import resample
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import AdaBoostClassifier,RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

In [2]:

# ---------------- leectura y guardado de los documentos en un data frame de pandas.
main_dataframe = pd.DataFrame(columns=['sentiment', 'text'])

dataframe_1 = pd.read_csv('./Data/Tweets0.csv')
dataframe_2 = pd.read_csv('./Data/Tweets1.csv')
dataframe_3 = pd.read_csv('./Data/Tweets2.csv')
dataframe_4 = pd.read_csv('./Data/Tweets3.csv')


In [3]:

dataframe_1 = dataframe_1.filter(['airline_sentiment', 'text'])
dataframe_2 = dataframe_2.filter(['sentiment', 'text'])
dataframe_3 = dataframe_3.filter(['sentiment', 'text'])
dataframe_4 = dataframe_4.filter(['sentiment', 'text'])

In [4]:
dataframe_1.columns = ['sentiment', 'text']

dataframe_1['sentiment'] = dataframe_1.sentiment.map({'positive': 0, 'negative': 1, 'neutral': 2})
dataframe_2['sentiment'] = dataframe_2.sentiment.map({'positive': 0, 'negative': 1, 'neutral': 2})
dataframe_3['sentiment'] = dataframe_3.sentiment.map({'positive': 0, 'negative': 1, 'neutral': 2})
dataframe_4['sentiment'] = dataframe_4.sentiment.map({'positive': 0, 'negative': 1, 'neutral': 2})

main_dataframe = pd.concat([dataframe_1, dataframe_2, dataframe_3, dataframe_4])
del dataframe_1, dataframe_2, dataframe_3, dataframe_4

print("\nTweets en el dataframe original: " + str(main_dataframe.shape[0]))



Tweets en el dataframe original: 16140


In [5]:
# ---------------- balanceamiento del dataset.
min_len = int(min(main_dataframe['sentiment'].value_counts()))
df_0 = resample(main_dataframe[main_dataframe.sentiment == 0], replace=False, n_samples=min_len)
df_1 = resample(main_dataframe[main_dataframe.sentiment == 1], replace=False, n_samples=min_len)
df_2 = resample(main_dataframe[main_dataframe.sentiment == 2], replace=False, n_samples=min_len)
new_main_dataframe = pd.concat([df_0, df_1, df_2])
del main_dataframe

print("Tweets en el dataframe balanceado: " + str(new_main_dataframe.shape[0]))


Tweets en el dataframe balanceado: 8010


In [6]:

# ---------------- construccion de modelo de bolsa de palabras.
count_vector = CountVectorizer()
features = count_vector.fit_transform(new_main_dataframe['text'])


In [7]:

# ---------------- Separacion en data y labels de entrnamiento.
data_train, data_test, label_train, label_test = train_test_split(
    features,
    new_main_dataframe['sentiment']
)


In [8]:

# ---------------- entrenamiento del modelo.
modelo = AdaBoostClassifier(DecisionTreeClassifier(max_depth=8), n_estimators=6)
modelo.fit(data_train, label_train)

AdaBoostClassifier(algorithm='SAMME.R',
                   base_estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                         class_weight=None,
                                                         criterion='gini',
                                                         max_depth=8,
                                                         max_features=None,
                                                         max_leaf_nodes=None,
                                                         min_impurity_decrease=0.0,
                                                         min_impurity_split=None,
                                                         min_samples_leaf=1,
                                                         min_samples_split=2,
                                                         min_weight_fraction_leaf=0.0,
                                                         presort='deprecated',
                          

In [9]:
# ---------------- implementacion del modelo.
predictions0 = modelo.predict(data_test)

In [10]:

# ---------------- resultados del modelo.
datos_totales_prueba = len(label_test)
print("Numero de datos para la prueba: " + str(datos_totales_prueba))


Numero de datos para la prueba: 2003


In [11]:
matrix = confusion_matrix(label_test, predictions0)
aciertos = sum([matrix[i][i] for i in range(matrix.shape[0])])
precision = aciertos / datos_totales_prueba
print("Numero de aciertos totales: " + str(aciertos))
print("Precision total: " + str(precision))

Numero de aciertos totales: 1259
Precision total: 0.6285571642536195


In [12]:
def imprimir_metricas_matriz_confusion_multiclase(matrix, headers, total_datos):
    print("Numero de datos para la prueba: " + str(total_datos))
    metrics = np.zeros([len(headers), 2])
    for i in range(matrix.shape[0]):
        precision = matrix[i][i] / sum(matrix[i])
        recoil = matrix[i][i] / sum([matrix[y][i] for y in range(matrix.shape[0])])
        metrics[i][0] = precision
        metrics[i][-1] = recoil
    precision_promedio = sum([metrics[y][0] for y in range(metrics.shape[0])]) / metrics.shape[0]
    recall_promedio = sum([metrics[y][-1] for y in range(metrics.shape[0])]) / metrics.shape[0]
    print("Precision promedio: " + str(precision_promedio))
    print("Recall promedio: " + str(recall_promedio))
    print("\nMatriz de confusion: \n" + str(tabulate(
        matrix,
        headers=headers,
        showindex=headers,
        tablefmt='grid')
    ))
    print("\nMetricas de desempeño: \n" + str(tabulate(
        metrics,
        headers=['precision', 'recall'],
        showindex=headers,
        tablefmt='grid')
    ))
    print("")


imprimir_metricas_matriz_confusion_multiclase(
    matrix,
    ['positive', 'negative', 'neutral'],
    len(label_test)
)

Numero de datos para la prueba: 2003
Precision promedio: 0.6274006309576078
Recall promedio: 0.6415748275219976

Matriz de confusion: 
+----------+------------+------------+-----------+
|          |   positive |   negative |   neutral |
| positive |        389 |         78 |       185 |
+----------+------------+------------+-----------+
| negative |         70 |        393 |       195 |
+----------+------------+------------+-----------+
| neutral  |         84 |        132 |       477 |
+----------+------------+------------+-----------+

Metricas de desempeño: 
+----------+-------------+----------+
|          |   precision |   recall |
| positive |    0.596626 | 0.71639  |
+----------+-------------+----------+
| negative |    0.597264 | 0.651741 |
+----------+-------------+----------+
| neutral  |    0.688312 | 0.556593 |
+----------+-------------+----------+



In [13]:
rfc = RandomForestClassifier(n_estimators=300, random_state=0)

In [14]:
rfc.fit(data_train, label_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=300,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [15]:
predictions = rfc.predict(data_test)

In [16]:
matrix1 = confusion_matrix(label_test, predictions)
aciertos = sum([matrix[i][i] for i in range(matrix.shape[0])])
precision = aciertos / datos_totales_prueba
print("Numero de aciertos totales: " + str(aciertos))
print("Precision total: " + str(precision))

Numero de aciertos totales: 1259
Precision total: 0.6285571642536195


In [17]:
def imprimir_metricas_matriz_confusion_multiclase(matrix, headers, total_datos):
    print("Numero de datos para la prueba: " + str(total_datos))
    metrics = np.zeros([len(headers), 2])
    for i in range(matrix.shape[0]):
        precision = matrix[i][i] / sum(matrix[i])
        recoil = matrix[i][i] / sum([matrix[y][i] for y in range(matrix.shape[0])])
        metrics[i][0] = precision
        metrics[i][-1] = recoil
    precision_promedio = sum([metrics[y][0] for y in range(metrics.shape[0])]) / metrics.shape[0]
    recall_promedio = sum([metrics[y][-1] for y in range(metrics.shape[0])]) / metrics.shape[0]
    print("Precision promedio: " + str(precision_promedio))
    print("Recall promedio: " + str(recall_promedio))
    print("\nMatriz de confusion: \n" + str(tabulate(
        matrix,
        headers=headers,
        showindex=headers,
        tablefmt='grid')
    ))
    print("\nMetricas de desempeño: \n" + str(tabulate(
        metrics,
        headers=['precision', 'recall'],
        showindex=headers,
        tablefmt='grid')
    ))
    print("")


imprimir_metricas_matriz_confusion_multiclase(
    matrix1,
    ['positive', 'negative', 'neutral'],
    len(label_test)
)

Numero de datos para la prueba: 2003
Precision promedio: 0.7002372793392242
Recall promedio: 0.7129572792561159

Matriz de confusion: 
+----------+------------+------------+-----------+
|          |   positive |   negative |   neutral |
| positive |        432 |         55 |       165 |
+----------+------------+------------+-----------+
| negative |         48 |        463 |       147 |
+----------+------------+------------+-----------+
| neutral  |         67 |        117 |       509 |
+----------+------------+------------+-----------+

Metricas de desempeño: 
+----------+-------------+----------+
|          |   precision |   recall |
| positive |    0.662577 | 0.789762 |
+----------+-------------+----------+
| negative |    0.703647 | 0.729134 |
+----------+-------------+----------+
| neutral  |    0.734488 | 0.619976 |
+----------+-------------+----------+

