In [1]:
import pandas as pd
import math
from sklearn.model_selection import train_test_split

df = pd.read_csv('../../res/tp2/reviews_sentiment.csv', sep=';')

df.head()


Unnamed: 0,Review Title,Review Text,wordcount,titleSentiment,textSentiment,Star Rating,sentimentValue
0,Sin conexión,Hola desde hace algo más de un mes me pone sin...,23,negative,negative,1,-0.486389
1,faltan cosas,Han mejorado la apariencia pero no,20,negative,negative,1,-0.586187
2,Es muy buena lo recomiendo,Andres e puto amoooo,4,,negative,1,-0.60224
3,Version antigua,Me gustana mas la version anterior esta es mas...,17,,negative,1,-0.616271
4,Esta bien,Sin ser la biblia.... Esta bien,6,negative,negative,1,-0.651784


In [2]:
# 
df.columns = ['title', 'review', 'wordcount', 'titleSentiment', 'textSentiment', 'rating', 'value']

# Removemos las variable que no tienen importancia
df.drop('title', inplace=True, axis=1)
df.drop('review', inplace=True, axis=1)
df.drop('textSentiment', inplace=True, axis=1)


In [3]:
# Removemos valores NaN
df = df.drop(df[df['titleSentiment'].isnull()].index)

#  Como necesitamos que las variables sean numericas, vamos a cambiar los valores de negative y positive por {-1, 1}

df.loc[df.titleSentiment == 'negative', 'titleSentiment'] = -1
df.loc[df.titleSentiment == 'positive', 'titleSentiment'] = 1

df.head()

Unnamed: 0,wordcount,titleSentiment,rating,value
0,23,-1,1,-0.486389
1,20,-1,1,-0.586187
4,6,-1,1,-0.651784
5,8,1,1,-0.720443
6,23,1,1,-0.726825


In [4]:
# Los comentarios valorados con 1 estrella, ¿que cantidad promedio de palabras tienen?
word_count = 0
count = 0
for index, row in df.iterrows():    
    if (row['rating'] == 1):
        count += 1
        word_count += row['wordcount']

mean_words = word_count / count
print(f'El promedio de palabras para los comentarios valorados con 1 estrella es {mean_words}')

El promedio de palabras para los comentarios valorados con 1 estrella es 12.470588235294118


In [5]:
# Dividir el conjunto de datos en un conjunto de entrenamiento y otro de prueba
train, test = train_test_split(df, test_size=0.2)

In [13]:
# Aplicar los algoritmos K-NN y K-NN con distancias pesadas para clasificar las opiniones, utilizando como variable objetivo la variable rating y como variables
# explicativas las variables wordcount, titleSentiment, value y con k = 5 (k se podria pasar por parametro)

def calc_euclidean_distance(record_1, record_2):
    distance = 0
    for col in df.columns:
        distance += math.pow(record_1[col] - record_2[col], 2) 
    return math.sqrt(distance)
    
# Returns a dataframe/collection with k nearest neighbours sorted by distance asc
def get_k_nearest_neighbours(record, k):
    
    if (k >= len(train)):
       raise ValueError("K debe ser menor a N")
    
    # Creamos un dataframe vacio
    distance_df = pd.DataFrame(columns = ['distance', 'rating'])
    
    # Calculamos d(record, row) para cada fila del train dataframe
    for index, row in train.iterrows():

        distance = calc_euclidean_distance(record, row)
        distance_df.loc[index] = [distance, row['rating']]
    
    return distance_df.sort_values('distance').head(k)


# Teniendo un dataframe con los k vecinos mas cercanos ordenados de menor a mayor distancia, queremos ver que clase es la ganadora
def classify_rating_w_neighbours(k_neigh_df):
    rating_counts = { rating : 0 for rating in range(1,6) }
    for index, row in k_neigh_df.iterrows():
        rating = row['rating']
        rating_counts[rating] += 1

    return max(rating_counts, key=lambda key: rating_counts[key])

# Si d(xq, xi) = 0, entonces f (xq) = f (xi) y si son muchos los valores cercanos cuya distancia es cero, 
# se le asigna el valor que mas aparece
def classify_rating_w_neighbours_weighted(k_neigh_df):
    
    rating_weight = { rating : 0 for rating in range(1,6) }
    min_dist_rating = { rating : 0 for rating in range(1,6) }
    
    for index, row in k_neigh_df.iterrows():
        distance = row['distance']
        rating = row['rating']
        
        if (distance == 0):
            min_dist_rating[rating] += 1
            
        else:
            w = 1 / math.pow(distance, 2)
            rating_weight[rating] += w
    
    # Se encontraron valores con distancia 0
    if (any(elem > 0 for elem in  min_dist_rating.values())):
         return max(min_dist_rating, key=lambda key:  min_dist_rating[key])
        
    return max(rating_weight, key=lambda key: rating_weight[key])

k = 6
for index, row in test.iterrows():
    record_neigh_df = get_k_nearest_neighbours(row, k)
    print(record_neigh_df.head(6))
    classified_rating = classify_rating_w_neighbours_weighted(record_neigh_df)
    print(f'Classifed rating: {classified_rating}')
    

    
    

     distance  rating
36   2.188073     2.0
25   2.455402     1.0
154  2.978502     4.0
28   3.000760     2.0
84   3.019659     3.0
40   3.608130     2.0
Classifed rating: 2
     distance  rating
46   1.007137     3.0
93   1.414409     3.0
8    1.636361     1.0
101  2.236557     3.0
147  2.281598     4.0
1    2.326990     1.0
Classifed rating: 3
     distance  rating
199  0.047412     5.0
186  0.095715     5.0
177  0.132798     5.0
168  0.315003     5.0
211  0.334882     5.0
224  0.584668     5.0
Classifed rating: 5
     distance  rating
114  0.026483     3.0
119  0.036912     3.0
112  1.000946     3.0
122  1.001380     3.0
204  1.235536     4.0
165  1.424669     4.0
Classifed rating: 3
     distance  rating
40   1.003470     2.0
143  1.019988     2.0
84   1.147977     3.0
35   1.363211     2.0
25   1.415079     1.0
154  2.917094     4.0
Classifed rating: 2
     distance  rating
204  0.379591     4.0
175  1.000640     4.0
182  1.002695     4.0
165  1.014984     4.0
119  1.046692     3.

ZeroDivisionError: float division by zero

In [7]:
# Calcular la precision del clasificador y la matriz de confusion.