# Consigna

`def developer_reviews_analysis(desarrolladora: str)`:
Según el desarrollador, se devuelve un diccionario con el nombre del desarrollador como llave y, como valor, una lista con la cantidad total
de registros de reseñas de usuarios categorizados con un análisis de sentimiento positivo o negativo.
Ejemplo de retorno: `{'Valve': [Negative = 182, Positive = 278]}`

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.cluster import KMeans


In [2]:
# Download the NLTK resources used by the text preprocessing below:
# tokenizer models ('punkt'), stopword lists and the WordNet lemmatizer data.
# NOTE(review): newer NLTK releases may also require 'punkt_tab' for
# word_tokenize — confirm against the installed NLTK version.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ahurt\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ahurt\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ahurt\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Path to the reviews dataset (parquet), relative to this notebook.
reviews_path = '../ETL/Data_Extracted/DataFrame_reviews.parquet'
# Load the user reviews into a DataFrame.
ReviewsDF = pd.read_parquet(reviews_path)

In [4]:
# Preview the first two rows to inspect the available columns.
ReviewsDF.head(2)

Unnamed: 0,funny,posted,last_edited,item_id,helpful,recommend,review,user_id,user_url,reviews
0,,"Posted November 5, 2011.",,1250,No ratings yet,True,Simple yet with great replayability. In my opi...,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"{'funny': '', 'helpful': 'No ratings yet', 'it..."
1,,"Posted July 15, 2011.",,22200,No ratings yet,True,It's unique and worth a playthrough.,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"{'funny': '', 'helpful': 'No ratings yet', 'it..."


In [5]:
# Keep only the columns needed for the sentiment analysis and the later join.
# .copy() makes this an independent DataFrame: the 'review' column is
# overwritten further down, and assigning into a column subset of another
# frame triggers pandas' SettingWithCopyWarning.
ReviewsDF = ReviewsDF[['user_id', 'item_id', 'review']].copy()

In [6]:
# Display the reduced reviews DataFrame.
ReviewsDF

Unnamed: 0,user_id,item_id,review
0,76561197970982479,1250,Simple yet with great replayability. In my opi...
1,76561197970982479,22200,It's unique and worth a playthrough.
2,76561197970982479,43110,Great atmosphere. The gunplay can be a bit chu...
3,js41637,251610,I know what you think when you see this title ...
4,js41637,227300,For a simple (it's actually not all that simpl...
...,...,...,...
59300,76561198312638244,70,a must have classic from steam definitely wort...
59301,76561198312638244,362890,this game is a perfect remake of the original ...
59302,LydiaMorley,273110,had so much fun plaing this and collecting res...
59303,LydiaMorley,730,:D


# Análisis de sentimiento

In [7]:
from functools import lru_cache


@lru_cache(maxsize=1)
def _english_stop_words():
    """Return the NLTK English stopword list as a frozenset, loaded once."""
    return frozenset(stopwords.words('english'))


@lru_cache(maxsize=1)
def _shared_lemmatizer():
    """Return a single WordNetLemmatizer instance reused across calls."""
    return WordNetLemmatizer()


def preprocess_text(text):
    """Normalize a review for vectorization.

    The text is lowercased, tokenized with ``word_tokenize``, stripped of
    English stopwords and lemmatized with WordNet; the remaining tokens are
    joined back with single spaces.

    Args:
        text: Raw review text. Non-string values (e.g. ``None`` or ``NaN``
            from missing reviews) and blank strings are accepted.

    Returns:
        The cleaned text, or ``''`` when there is nothing to process.
    """
    # Missing reviews come through as None/NaN; treat them as empty text
    # instead of failing on .lower().
    if not isinstance(text, str) or not text.strip():
        return ''

    # Both resources are loaded lazily and cached: this function is applied
    # row by row, so rebuilding them on every call is wasted work.
    stop_words = _english_stop_words()
    lemmatize = _shared_lemmatizer().lemmatize

    tokens = word_tokenize(text.lower())
    # Stopwords are filtered before lemmatization, as in the original flow.
    return ' '.join(lemmatize(token) for token in tokens if token not in stop_words)

In [8]:
# Replace each raw review with its normalized form (see preprocess_text).
cleaned_reviews = ReviewsDF['review'].map(preprocess_text)
ReviewsDF['review'] = cleaned_reviews

# Build the TF-IDF matrix: one row per review, one column per vocabulary term.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(ReviewsDF['review'])

In [9]:
# Clustering
k = 3  # Number of clusters (intended as negative, neutral, positive)

# Pin the seed so the cluster ids are the same on every run: the code further
# down hard-codes which id means Positive / Neutral / Negative, and without a
# fixed random_state those ids can change between executions.
kmeans = KMeans(n_clusters=k, random_state=42)
kmeans.fit(X)

# Cluster id assigned to each review, in row order
cluster_labels = kmeans.labels_

# Store the cluster id as the sentiment label.
# NOTE(review): KMeans is unsupervised, so nothing here ties a given id to a
# sentiment polarity — verify the id -> sentiment mapping by inspecting
# sample reviews from each cluster.
ReviewsDF['sentiment_analysis'] = cluster_labels

In [10]:
# Display the reviews together with the new sentiment_analysis column.
ReviewsDF

Unnamed: 0,user_id,item_id,review,sentiment_analysis
0,76561197970982479,1250,simple yet great replayability . opinion `` zo...,2
1,76561197970982479,22200,'s unique worth playthrough .,2
2,76561197970982479,43110,great atmosphere . gunplay bit chunky time end...,2
3,js41637,251610,know think see title `` barbie dreamhouse part...,2
4,js41637,227300,simple ( 's actually simple ! ) truck driving ...,2
...,...,...,...,...
59300,76561198312638244,70,must classic steam definitely worth buying .,2
59301,76561198312638244,362890,game perfect remake original half life . perso...,2
59302,LydiaMorley,273110,much fun plaing collecting resource xd first t...,2
59303,LydiaMorley,730,:,2


In [11]:
# Path to the games dataset (parquet), relative to this notebook.
games_path = '../ETL/Data_Extracted/DataFrame_games.parquet'
# Load the games into a DataFrame.
GamesDF = pd.read_parquet(games_path)

In [13]:
# Rename the key to match the reviews table and keep only the two columns
# needed to attach a developer to each review.
GamesDF = GamesDF.rename(columns={'id': 'item_id'}).loc[:, ['item_id', 'developer']]
GamesDF

Unnamed: 0,item_id,developer
0,761140,Kotoshiro
1,643980,Secret Level SRL
2,670290,Poolians.com
3,767400,彼岸领域
4,772540,Trickjump Games Ltd
...,...,...
27457,745400,Bidoniera Games
27458,773640,"Nikita ""Ghost_RUS"""
27459,733530,Sacada
27460,610660,Laush Dmitriy Sergeevich


In [18]:
# Attach the developer to every review; reviews whose item_id has no match
# in GamesDF are dropped by the inner join.
BestDevDF = pd.merge(ReviewsDF, GamesDF, how="inner", on="item_id")

In [19]:
# Only the sentiment label and the developer are needed from here on.
BestDevDF = BestDevDF.drop(['review', 'user_id', 'item_id'], axis='columns')
BestDevDF

Unnamed: 0,sentiment_analysis,developer
0,2,Tripwire Interactive
1,2,ACE Team
2,2,SCS Software
3,2,3909
4,2,"Hopoo Games, LLC"
...,...,...
49611,2,Valve
49612,2,Crowbar Collective
49613,2,Nexon
49614,2,Valve


In [20]:
def developer_reviews_analysis(desarrolladora: str, reviews_df=None):
    """Count a developer's reviews per sentiment label.

    Args:
        desarrolladora: Developer name, matched exactly against the
            ``developer`` column.
        reviews_df: Optional DataFrame with ``developer`` and
            ``sentiment_analysis`` columns. Defaults to the module-level
            ``BestDevDF`` when omitted.

    Returns:
        ``{desarrolladora: {'Positive': int, 'Neutral': int, 'Negative': int}}``.
        All counts are 0 when the developer has no reviews.
    """
    if reviews_df is None:
        reviews_df = BestDevDF

    # Sentiment labels of the reviews that belong to the requested developer
    sentiments = reviews_df.loc[
        reviews_df['developer'] == desarrolladora, 'sentiment_analysis'
    ]

    # Label code -> name mapping used by this notebook.
    # NOTE(review): the codes appear to be the KMeans cluster ids assigned
    # upstream — confirm that 2/1/0 really correspond to these polarities.
    label_codes = (('Positive', 2), ('Neutral', 1), ('Negative', 0))

    # int(...) converts numpy integers to plain Python ints so the result can
    # be JSON-serialized by the API.
    return {
        desarrolladora: {
            label: int((sentiments == code).sum())
            for label, code in label_codes
        }
    }

In [21]:
# Smoke test: print the sentiment counts for a single developer.
print(developer_reviews_analysis('Capcom'))

{'Capcom': {'Positive': 96, 'Neutral': 7, 'Negative': 0}}


In [22]:
# Generate the file used by the API queries.
from pathlib import Path

# Create the output folder first so to_parquet does not fail when ./Out
# does not exist yet.
Path('./Out').mkdir(parents=True, exist_ok=True)
BestDevDF.to_parquet('./Out/DataFrame-punto05_sentimental_analysis.parquet', index=False)