In [7]:
import pandas as pd
import numpy as np
from tabulate import tabulate
from sklearn.utils import resample
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import AdaBoostClassifier,RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from .cargar_datos import Cargar_datos

ModuleNotFoundError: No module named 'cargar_datos'

In [2]:

# ---------------- leectura y guardado de los documentos en un data frame de pandas.
main_dataframe = pd.DataFrame(columns=['sentiment', 'text'])

dataframe_1 = pd.read_csv('./Data/Tweets0.csv')
dataframe_2 = pd.read_csv('./Data/Tweets1.csv')
dataframe_3 = pd.read_csv('./Data/Tweets2.csv')
dataframe_4 = pd.read_csv('./Data/Tweets3.csv')

In [3]:

dataframe_1 = dataframe_1.filter(['airline_sentiment', 'text'])
dataframe_2 = dataframe_2.filter(['sentiment', 'text'])
dataframe_3 = dataframe_3.filter(['sentiment', 'text'])
dataframe_4 = dataframe_4.filter(['sentiment', 'text'])

In [4]:
dataframe_1.columns = ['sentiment', 'text']

dataframe_1['sentiment'] = dataframe_1.sentiment.map({'positive': 0, 'negative': 1, 'neutral': 2})
dataframe_2['sentiment'] = dataframe_2.sentiment.map({'positive': 0, 'negative': 1, 'neutral': 2})
dataframe_3['sentiment'] = dataframe_3.sentiment.map({'positive': 0, 'negative': 1, 'neutral': 2})
dataframe_4['sentiment'] = dataframe_4.sentiment.map({'positive': 0, 'negative': 1, 'neutral': 2})

main_dataframe = pd.concat([dataframe_1, dataframe_2, dataframe_3, dataframe_4])
del dataframe_1, dataframe_2, dataframe_3, dataframe_4

print("\nTweets en el dataframe original: " + str(main_dataframe.shape[0]))


Tweets en el dataframe original: 16140


In [5]:
# ---------------- balanceamiento del dataset.
min_len = int(min(main_dataframe['sentiment'].value_counts()))
df_0 = resample(main_dataframe[main_dataframe.sentiment == 0], replace=False, n_samples=min_len)
df_1 = resample(main_dataframe[main_dataframe.sentiment == 1], replace=False, n_samples=min_len)
df_2 = resample(main_dataframe[main_dataframe.sentiment == 2], replace=False, n_samples=min_len)
new_main_dataframe = pd.concat([df_0, df_1, df_2])
del main_dataframe

print("Tweets en el dataframe balanceado: " + str(new_main_dataframe.shape[0]))


Tweets en el dataframe balanceado: 8010


In [6]:
#----------------- Lexicones
#Lectura de SentiWordNet Obtenido de 
#https://www.nltk.org/_modules/nltk/corpus/reader/sentiwordnet.html

# SentiWordNet[word] = {POS,	ID,	PosScore,	NegScore}
contador = 0
SentiWordNet = dict()
for lines in open('Data Lexicon/SentiWordNet_3.0.0.txt'):
    if lines.startswith('#'):
        continue
    line = lines.split('\t')
    palabra = line[4].split('#')[0]
    if (palabra in SentiWordNet) or (palabra==''):
        continue
    else:
        SentiWordNet[palabra]={'POS': line[0], 'ID': line[1], 'PosScore': line[2], 'NegScore': line[3]}

In [7]:
# AFFIN[word] = sentiment
AFFIN = dict()
for lines in open('Data Lexicon/AFFIN-111.txt'):
    AFFIN[lines.split('\t')[0]]=(lines.split('\t')[1]).split('\n')[0]

In [8]:
tt = TweetTokenizer()
def Lexicon(data):
    addlex = list()
    for frase in data:
        splited = tt.tokenize(frase)
        sum_swn_neg = 0
        sum_swn_pos = 0
        affin = 0
        stnet = 0
        word_stat = 0
        for word in splited:
            if word in SentiWordNet.keys():
                sum_swn_neg += float(SentiWordNet[word]['NegScore'])
                sum_swn_pos += float(SentiWordNet[word]['PosScore'])
            if word in AFFIN.keys():
                affin += float(AFFIN[word])
            
        addlex.append([sum_swn_neg, sum_swn_pos, affin])
    return addlex

In [9]:
def extension(matriz, data):
    extendLex = np.array(Lexicon(data))
    return np.append(matriz, extendLex, 1)

In [10]:
vectorizer = TfidfVectorizer(max_features=3000, min_df=7, max_df=0.8)

In [11]:
features = vectorizer.fit_transform(new_main_dataframe['text']).toarray()

In [12]:
labels = new_main_dataframe['sentiment']

In [13]:
vectorizacionLexicon = extension(features, new_main_dataframe['text'])

In [14]:
for i in vectorizacionLexicon: print(i)

[0. 0. 0. ... 0. 0. 0.]
[0.    0.    0.    ... 0.625 0.375 3.   ]
[0. 0. 0. ... 0. 0. 0.]
[0.    0.    0.    ... 0.125 1.125 3.   ]
[0.    0.    0.    ... 0.5   1.125 4.   ]
[0.  0.  0.  ... 0.  0.5 1. ]
[0. 0. 0. ... 0. 0. 0.]
[0.  0.  0.  ... 0.  0.5 5. ]
[0.   0.   0.   ... 0.   0.25 0.  ]
[0. 0. 0. ... 0. 0. 0.]
[0.   0.   0.   ... 0.   0.25 5.  ]
[0.    0.    0.    ... 0.125 0.375 1.   ]
[0.    0.    0.    ... 0.    0.125 0.   ]
[0.    0.    0.    ... 0.625 0.    3.   ]
[0.    0.    0.    ... 0.375 1.25  8.   ]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 2.]
[0.    0.    0.    ... 0.    1.375 3.   ]
[ 0.     0.     0.    ...  0.375  0.625 -2.   ]
[0.    0.    0.    ... 0.625 0.    7.   ]
[0.    0.    0.    ... 0.125 1.25  6.   ]
[0. 0. 0. ... 0. 0. 0.]
[0.  0.  0.  ... 0.5 0.  3. ]
[0.    0.    0.    ... 0.25  0.375 2.   ]
[ 0.     0.     0.    ...  0.     0.625 11.   ]
[0. 0. 0. ... 0. 0. 0.]
[0.    0.    0.    ... 0.5   0.875 5.   ]
[0.   0.   0.   ... 0.25 1.25 2.  ]
[0.   0.  

[0. 0. 0. ... 0. 0. 2.]
[0. 0. 0. ... 0. 0. 0.]
[0.    0.    0.    ... 0.125 0.5   0.   ]
[0.    0.    0.    ... 0.375 0.375 0.   ]
[0.    0.    0.    ... 0.    0.625 3.   ]
[0.    0.    0.    ... 0.125 0.    2.   ]
[0.   0.   0.   ... 0.   0.75 2.  ]
[ 0.     0.     0.    ...  0.75   0.625 -1.   ]
[0.   0.   0.   ... 0.   0.75 2.  ]
[0.    0.    0.    ... 0.625 0.    0.   ]
[0.    0.    0.    ... 0.625 0.5   0.   ]
[0.    0.    0.    ... 0.875 0.5   0.   ]
[0.    0.    0.    ... 0.    1.375 4.   ]
[ 0.     0.     0.    ...  0.875  0.375 -3.   ]
[0.    0.    0.    ... 0.125 0.    3.   ]
[0.    0.    0.    ... 0.5   0.375 4.   ]
[0.    0.    0.    ... 0.125 0.    5.   ]
[0. 0. 0. ... 0. 0. 0.]
[0.    0.    0.    ... 0.125 0.625 4.   ]
[0. 0. 0. ... 0. 0. 2.]
[0.    0.    0.    ... 0.875 0.    0.   ]
[0.    0.    0.    ... 0.    0.375 3.   ]
[0.    0.    0.    ... 0.375 1.    1.   ]
[0. 0. 0. ... 0. 0. 3.]
[0.    0.    0.    ... 0.125 0.    4.   ]
[0.    0.    0.    ... 0.125 0.    2.   

[0.    0.    0.    ... 0.125 0.125 5.   ]
[0.    0.    0.    ... 0.25  0.125 0.   ]
[0.    0.    0.    ... 0.125 0.    0.   ]
[0.    0.    0.    ... 0.125 0.375 5.   ]
[0.    0.    0.    ... 0.375 1.125 2.   ]
[0.    0.    0.    ... 0.625 1.25  0.   ]
[0.    0.    0.    ... 0.    0.625 0.   ]
[ 0.    0.    0.   ...  1.    0.75 -1.  ]
[ 0.     0.     0.    ...  1.125  1.125 -1.   ]
[0.   0.   0.   ... 0.   0.25 0.  ]
[0. 0. 0. ... 0. 0. 0.]
[0.    0.    0.    ... 0.5   0.375 0.   ]
[0.    0.    0.    ... 0.625 0.375 0.   ]
[0.   0.   0.   ... 0.   1.25 3.  ]
[ 0.     0.     0.    ...  0.125  0.375 -1.   ]
[0.    0.    0.    ... 0.375 0.    3.   ]
[ 0.     0.     0.    ...  0.875  0.75  -1.   ]
[0.  0.  0.  ... 0.5 0.5 2. ]
[0.    0.    0.    ... 0.5   1.625 0.   ]
[ 0.     0.     0.    ...  1.625  1.    -6.   ]
[0.    0.    0.    ... 0.125 1.    2.   ]
[ 0.    0.    0.   ...  0.75  0.   -3.  ]
[ 0.     0.     0.    ...  1.5    1.375 -1.   ]
[0.    0.    0.    ... 0.875 0.375 0.   ]
[0. 

[0.    0.    0.    ... 1.875 0.875 0.   ]
[ 0.     0.     0.    ...  0.625  0.75  -2.   ]
[0.    0.    0.    ... 0.    0.375 0.   ]
[ 0.     0.     0.    ...  0.375  1.375 -2.   ]
[0.    0.    0.    ... 0.25  0.125 1.   ]
[0.   0.   0.   ... 0.   1.75 2.  ]
[0. 0. 0. ... 0. 0. 0.]
[ 0.  0.  0. ...  0.  0. -3.]
[ 0.     0.     0.    ...  0.5    0.125 -1.   ]
[ 0.     0.     0.    ...  0.875  1.    -2.   ]
[ 0.     0.     0.    ...  0.375  0.625 -1.   ]
[0.   0.   0.   ... 1.75 0.25 3.  ]
[0.    0.    0.    ... 0.625 0.625 0.   ]
[0.  0.  0.  ... 0.5 0.  0. ]
[ 0.     0.     0.    ...  0.25   0.125 -5.   ]
[0. 0. 0. ... 1. 1. 1.]
[0.    0.    0.    ... 1.125 0.5   0.   ]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0.    0.    0.    ... 2.125 0.75  0.   ]
[0.    0.    0.    ... 0.125 1.125 2.   ]
[0.  0.  0.  ... 0.  0.5 0. ]
[ 0.   0.   0.  ...  0.5  1.  -2. ]
[0.    0.    0.    ... 0.    0.125 0.   ]
[ 0.    0.    0.   ...  0.25  0.   -2.  ]
[0.    0.    0.    ... 0.25  0.375 0.   

[0.    0.    0.    ... 0.625 0.75  2.   ]
[0.    0.    0.    ... 1.125 0.    0.   ]
[ 0.     0.     0.    ...  0.     1.375 -1.   ]
[0.    0.    0.    ... 0.625 1.5   4.   ]
[0. 0. 0. ... 0. 0. 0.]
[ 0.     0.     0.    ...  0.875  0.    -3.   ]
[ 0.     0.     0.    ...  0.625  0.375 -2.   ]
[ 0.  0.  0. ...  0.  0. -1.]
[ 0.     0.     0.    ...  0.75   0.875 -2.   ]
[ 0.     0.     0.    ...  0.625  0.375 -1.   ]
[0.    0.    0.    ... 0.    0.125 0.   ]
[ 0.     0.     0.    ...  0.875  0.75  -1.   ]
[ 0.     0.     0.    ...  1.125  0.5   -1.   ]
[0.    0.    0.    ... 0.    0.375 0.   ]
[ 0.  0.  0. ...  0.  0. -5.]
[ 0.  0.  0. ...  0.  0. -3.]
[ 0.    0.    0.   ...  2.5   0.25 -4.  ]
[ 0.    0.    0.   ...  0.    0.25 -1.  ]
[ 0.     0.     0.    ...  1.125  0.125 -1.   ]
[0.  0.  0.  ... 0.  0.5 2. ]
[0.    0.    0.    ... 0.375 1.    0.   ]
[ 0.     0.     0.    ...  1.75   0.125 -1.   ]
[ 0.     0.     0.    ...  1.625  0.875 -3.   ]
[ 0.     0.     0.    ...  2.25   0.625 

[0.    0.    0.    ... 0.375 0.625 2.   ]
[0.  0.  0.  ... 0.5 0.  0. ]
[0.    0.    0.    ... 0.25  0.625 2.   ]
[0.    0.    0.    ... 0.    0.125 0.   ]
[0.    0.    0.    ... 1.125 0.5   2.   ]
[0. 0. 0. ... 0. 0. 0.]
[0.    0.    0.    ... 0.    0.625 0.   ]
[0. 0. 0. ... 0. 0. 0.]
[ 0.     0.     0.    ...  0.     0.625 -1.   ]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0.    0.    0.    ... 0.625 0.    0.   ]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]
[0.    0.    0.    ... 0.    0.375 0.   ]
[0. 0. 0. ... 0. 0. 0.]
[0.         0.         0.26618039 ... 0.         1.25       0.        ]
[0. 0. 0. ... 0. 0. 0.]
[0.  0.  0.  ... 0.  0.5 0. ]
[0.    0.    0.    ... 0.625 0.875 1.   ]
[0.    0.    0.    ... 0.    0.125 2.   ]
[0.    0.    0.    ... 0.    1.125 2.   ]
[0. 0. 0. ... 0. 0. 0.]
[0.    0.    0.    ... 0.375 0.25  0.   ]
[0. 0. 0. ... 0. 0. 0.]
[ 0.    0.    0.   ...  0.    0.25 -2.  ]
[0.    0.    0.    ... 0.875 0.375 0.   ]
[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. 

[ 0.     0.     0.    ...  0.     0.375 -3.   ]
[0.    0.    0.    ... 1.    0.375 0.   ]
[0. 0. 0. ... 0. 0. 0.]
[0.    0.    0.    ... 0.125 0.25  0.   ]
[0.    0.    0.    ... 0.25  0.375 0.   ]
[0.    0.    0.    ... 0.    0.125 1.   ]
[0.    0.    0.    ... 0.    0.125 0.   ]
[0.  0.  0.  ... 0.5 0.  0. ]
[0.   0.   0.   ... 0.   0.75 0.  ]
[ 0.     0.     0.    ...  1.25   0.375 -3.   ]
[0. 0. 0. ... 0. 0. 0.]
[0.  0.  0.  ... 0.5 0.5 1. ]
[0.    0.    0.    ... 1.    0.625 3.   ]
[0. 0. 0. ... 0. 0. 0.]
[0.    0.    0.    ... 0.625 0.125 0.   ]
[0. 0. 0. ... 0. 0. 2.]
[0.    0.    0.    ... 1.375 0.    0.   ]
[0. 0. 0. ... 0. 0. 0.]
[0.  0.  0.  ... 0.  0.5 2. ]
[ 0.  0.  0. ...  0.  0. -1.]
[0. 0. 0. ... 0. 0. 0.]
[0.    0.    0.    ... 0.625 0.    0.   ]
[0. 0. 0. ... 0. 0. 0.]
[0.    0.    0.    ... 0.    0.375 0.   ]
[0.    0.    0.    ... 0.    0.625 0.   ]
[0. 0. 0. ... 0. 0. 0.]
[0.    0.    0.    ... 0.125 0.    2.   ]
[0.  0.  0.  ... 0.  0.5 0. ]
[0.    0.    0.    ...

In [15]:
data_train, data_test, label_train, label_test = train_test_split(vectorizacionLexicon,labels)

### Metricas obtenidas usando AdaBoostClassifier con 6 modelos y una profundidad maxima de 8

In [16]:
modelo = AdaBoostClassifier(DecisionTreeClassifier(max_depth=8), n_estimators=6)

In [17]:
modelo.fit(data_train, label_train)

AdaBoostClassifier(algorithm='SAMME.R',
                   base_estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                         class_weight=None,
                                                         criterion='gini',
                                                         max_depth=8,
                                                         max_features=None,
                                                         max_leaf_nodes=None,
                                                         min_impurity_decrease=0.0,
                                                         min_impurity_split=None,
                                                         min_samples_leaf=1,
                                                         min_samples_split=2,
                                                         min_weight_fraction_leaf=0.0,
                                                         presort='deprecated',
                          

In [18]:
# ---------------- implementacion del modelo.
predictions0 = modelo.predict(data_test)

In [19]:

# ---------------- resultados del modelo.
datos_totales_prueba = len(label_test)
print("Numero de datos para la prueba: " + str(datos_totales_prueba))


Numero de datos para la prueba: 2003


In [20]:
matrix = confusion_matrix(label_test, predictions0)
aciertos = sum([matrix[i][i] for i in range(matrix.shape[0])])
precision = aciertos / datos_totales_prueba
print("Numero de aciertos totales: " + str(aciertos))
print("Precision total: " + str(precision))

Numero de aciertos totales: 1281
Precision total: 0.6395406889665501


In [21]:

def imprimir_metricas_matriz_confusion_multiclase(matrix, headers, total_datos):
    print("Numero de datos para la prueba: " + str(total_datos))
    metrics = np.zeros([len(headers), 2])
    for i in range(matrix.shape[0]):
        precision = matrix[i][i] / sum(matrix[i])
        recoil = matrix[i][i] / sum([matrix[y][i] for y in range(matrix.shape[0])])
        metrics[i][0] = precision
        metrics[i][-1] = recoil
    precision_promedio = sum([metrics[y][0] for y in range(metrics.shape[0])]) / metrics.shape[0]
    recall_promedio = sum([metrics[y][-1] for y in range(metrics.shape[0])]) / metrics.shape[0]
    print("Precision promedio: " + str(precision_promedio))
    print("Recall promedio: " + str(recall_promedio))
    print("\nMatriz de confusion: \n" + str(tabulate(
        matrix,
        headers=headers,
        showindex=headers,
        tablefmt='grid')
    ))
    print("\nMetricas de desempeño: \n" + str(tabulate(
        metrics,
        headers=['precision', 'recall'],
        showindex=headers,
        tablefmt='grid')
    ))
    print("")


imprimir_metricas_matriz_confusion_multiclase(
    matrix,
    ['positive', 'negative', 'neutral'],
    len(label_test)
)

Numero de datos para la prueba: 2003
Precision promedio: 0.6395011699894567
Recall promedio: 0.6468166044856534

Matriz de confusion: 
+----------+------------+------------+-----------+
|          |   positive |   negative |   neutral |
| positive |        404 |         82 |       186 |
+----------+------------+------------+-----------+
| negative |         71 |        458 |       144 |
+----------+------------+------------+-----------+
| neutral  |         86 |        153 |       419 |
+----------+------------+------------+-----------+

Metricas de desempeño: 
+----------+-------------+----------+
|          |   precision |   recall |
| positive |    0.60119  | 0.720143 |
+----------+-------------+----------+
| negative |    0.680535 | 0.660895 |
+----------+-------------+----------+
| neutral  |    0.636778 | 0.559413 |
+----------+-------------+----------+



### Metricas obtenidas usando RandomForest con 300 modelos

In [22]:
rfc = RandomForestClassifier(n_estimators=300, random_state=0)

In [23]:
rfc.fit(data_train, label_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=300,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [24]:
predictions = rfc.predict(data_test)

In [25]:
matrix1 = confusion_matrix(label_test, predictions)
aciertos = sum([matrix[i][i] for i in range(matrix.shape[0])])
precision = aciertos / datos_totales_prueba
print("Numero de aciertos totales: " + str(aciertos))
print("Precision total: " + str(precision))

Numero de aciertos totales: 1281
Precision total: 0.6395406889665501


In [26]:
def imprimir_metricas_matriz_confusion_multiclase(matrix, headers, total_datos):
    print("Numero de datos para la prueba: " + str(total_datos))
    metrics = np.zeros([len(headers), 2])
    for i in range(matrix.shape[0]):
        precision = matrix[i][i] / sum(matrix[i])
        recoil = matrix[i][i] / sum([matrix[y][i] for y in range(matrix.shape[0])])
        metrics[i][0] = precision
        metrics[i][-1] = recoil
    precision_promedio = sum([metrics[y][0] for y in range(metrics.shape[0])]) / metrics.shape[0]
    recall_promedio = sum([metrics[y][-1] for y in range(metrics.shape[0])]) / metrics.shape[0]
    print("Precision promedio: " + str(precision_promedio))
    print("Recall promedio: " + str(recall_promedio))
    print("\nMatriz de confusion: \n" + str(tabulate(
        matrix,
        headers=headers,
        showindex=headers,
        tablefmt='grid')
    ))
    print("\nMetricas de desempeño: \n" + str(tabulate(
        metrics,
        headers=['precision', 'recall'],
        showindex=headers,
        tablefmt='grid')
    ))
    print("")


imprimir_metricas_matriz_confusion_multiclase(
    matrix1,
    ['positive', 'negative', 'neutral'],
    len(label_test)
)

Numero de datos para la prueba: 2003
Precision promedio: 0.6938933224539117
Recall promedio: 0.7003294096921392

Matriz de confusion: 
+----------+------------+------------+-----------+
|          |   positive |   negative |   neutral |
| positive |        448 |         85 |       139 |
+----------+------------+------------+-----------+
| negative |         50 |        490 |       133 |
+----------+------------+------------+-----------+
| neutral  |         73 |        133 |       452 |
+----------+------------+------------+-----------+

Metricas de desempeño: 
+----------+-------------+----------+
|          |   precision |   recall |
| positive |    0.666667 | 0.784588 |
+----------+-------------+----------+
| negative |    0.728083 | 0.69209  |
+----------+-------------+----------+
| neutral  |    0.68693  | 0.624309 |
+----------+-------------+----------+

