In [1]:
#Librerias
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from textblob import TextBlob
from xgboost import XGBRegressor
from sklearn.metrics import r2_score

In [2]:
# 1. Load the data: a 10% sample of the merged Yelp business / review / tip DataFrames
df = pd.read_csv(filepath_or_buffer='rep_sample.csv')

In [21]:
# Preview the first five rows of the loaded frame
df.head(n=5)

Unnamed: 0,business_id,postal_code,latitude,longitude,stars,main category,tip_text,review_text,rev_yelp_sentiment,tip_yelp_sentiment,avg_sentiment,cluster,combined_score,review_sentiment,tip_sentiment
0,cGX-1IUwXOjkUqZbkKYcjw,19107,39.950917,-75.162971,4.0,Restaurants,"Awesome salad bar with good quality cheeses, v...",We went for lunch on a wed afternoon. It defin...,0.194722,1.0,0.597361,0,5.792083,0.194722,1.0
1,ueAkLzWFFTzQkq3jzyBlnA,19107,39.955392,-75.154698,4.0,Restaurants,It's such an addiction..,I actually still love this place...probably be...,0.371469,0.0,0.185735,0,4.557204,0.371469,0.0
2,tMtI6ECD6hwM-nFp2kXLKQ,19107,39.954605,-75.156209,4.0,Food,I love their fresh watermelon drink!,Thanks to bubble tea reviews on Yelp I read ab...,0.382248,0.4375,0.409874,0,5.229622,0.382248,0.4375
3,q-zV08jt6U-q05SMEuQJAQ,19148,39.914107,-75.148727,4.0,Restaurants,Love it!\nLove it!\nGood food,Simply based on the cheesesteak.\n\nCheesestea...,0.2163,0.65,0.43315,0,5.299451,0.2163,0.65
4,EzQpL3jNNGlUzeR6n3uevg,19130,39.968551,-75.174874,3.5,Bars,Delicious pierogi pizza!,"Living nearby, I had always wanted to enjoy Re...",-0.005455,1.0,0.497273,0,4.991818,-0.005455,1.0


In [9]:
# 2. Analisis de Sentimiento review_text and tip_text
def analyze_sentiment(text):
    """Return the TextBlob sentiment polarity of ``text``.

    Parameters
    ----------
    text : str or any scalar
        The text to score. Missing values (``None`` / ``NaN``) and blank or
        whitespace-only strings are treated as neutral. Non-string scalars are
        converted with ``str()`` before scoring, because TextBlob only accepts
        strings.

    Returns
    -------
    float
        ``0.0`` for missing or blank input, otherwise the polarity reported by
        ``TextBlob(text).sentiment.polarity``.
    """
    # Missing text counts as neutral.
    if pd.isnull(text):
        return 0.0
    # Coerce stray non-string cells (e.g. numbers) so TextBlob does not reject them.
    text = str(text)
    # Nothing to analyse in a blank string: skip the TextBlob call entirely.
    if not text.strip():
        return 0.0
    return TextBlob(text).sentiment.polarity

# Score each free-text column with analyze_sentiment
df['review_sentiment'] = df['review_text'].map(analyze_sentiment)
df['tip_sentiment'] = df['tip_text'].map(analyze_sentiment)

# 3. Combine the scores: row-wise mean of both sentiments (0 if the mean is undefined)
sentiment_scores = df.loc[:, ['review_sentiment', 'tip_sentiment']]
df['avg_sentiment'] = sentiment_scores.mean(axis='columns').fillna(value=0)

In [15]:
# 4. Cluster the businesses on latitude / longitude (5 clusters, fixed seed)
kmeans = KMeans(n_clusters=5, random_state=42)
X_geo = df.loc[:, ['latitude', 'longitude']]
df['cluster'] = kmeans.fit(X_geo).labels_

# 5. Combine the star rating with the sentiment score
#    (sentiment is multiplied by 3 to give it comparable weight to the stars)
df['combined_score'] = df['stars'] + 3 * df['avg_sentiment']

In [22]:
# 6. Machine-learning training / testing
# Input features and regression target.
# NOTE(review): combined_score is computed as stars + 3 * avg_sentiment, and both
# of those columns are also in X, so the target is an exact function of the inputs.
# Error / R^2 figures obtained from this setup are near-perfect by construction.
X = df.loc[:, ['latitude', 'longitude', 'stars', 'avg_sentiment', 'postal_code']]
y = df.loc[:, 'combined_score']

# 7. Hold out 20% of the rows for testing (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# 8. Standardise the features: statistics are learnt on the training split only
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 9. Hyper-parameter search for the XGBoost regressor (5-fold CV, scored by negative MSE)
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.2],
)

model = XGBRegressor(random_state=42)
grid_search = GridSearchCV(model, param_grid, scoring='neg_mean_squared_error', cv=5)
grid_search.fit(X_train_scaled, y_train)

# 10. Evaluate the best estimator found by the grid search on the held-out test split
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_scaled)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f"Mean Squared Error: {mse}")


# 11. Feature contributions reported by the fitted model, most important first
feature_importances = (
    pd.DataFrame({'importancia': best_model.feature_importances_}, index=X.columns)
    .sort_values(by='importancia', ascending=False)
)
print("Evaluacion de importancia:")
print(feature_importances)

Mean Squared Error: 0.00041758801372781493
Evalucion de importancia:
               importancia
stars             0.555262
avg_sentiment     0.444651
postal_code       0.000032
longitude         0.000031
latitude          0.000025


In [23]:
# 12. Aggregate businesses per ('main category', cluster): mean scores and coordinates,
#     number of businesses, and the first postal code seen in each group
df_grouped = (
    df.groupby(['main category', 'cluster'])
    .agg(
        combined_score=('combined_score', 'mean'),
        stars=('stars', 'mean'),
        avg_sentiment=('avg_sentiment', 'mean'),
        business_id=('business_id', 'count'),
        latitude=('latitude', 'mean'),
        longitude=('longitude', 'mean'),
        postal_code=('postal_code', 'first'),
    )
    .reset_index()
)

# 13. Find the 5 best categories: average the per-cluster scores of each category,
#     keep the 5 largest and renumber the rows from 0
top_5_categories = (
    df_grouped.groupby('main category')
    .agg({'combined_score': 'mean'})
    .reset_index()
    .nlargest(5, 'combined_score')
    .reset_index(drop=True)
)

# Print the 5 best categories in score order, ranked from 1
print("Top 5 Categorias - Puntaje:")
ranked_rows = zip(top_5_categories['main category'], top_5_categories['combined_score'])
for rank, (name, score) in enumerate(ranked_rows, start=1):
    print(f"{rank}. {name} -  {score:.2f}")

Top 5 Categorias - Puntaje:
1. Pets -  4.93
2. Active Life -  4.79
3. Food -  4.74
4. Shopping -  4.61
5. Beauty & Spas -  4.53


In [25]:
# 14. Distancia Haversine para evitar que las 2 ubicaciones sean iguales
def haversine_distance(lat1, lon1, lat2, lon2):
    """Great-circle (haversine) distance between two points, in meters.

    Parameters
    ----------
    lat1, lon1 : float or array-like
        Latitude and longitude of the first point, in decimal degrees.
    lat2, lon2 : float or array-like
        Latitude and longitude of the second point, in decimal degrees.

    Returns
    -------
    float or numpy.ndarray
        Distance in meters, computed on a sphere of radius 6371 km.
    """
    R = 6371  # Earth radius in kilometers
    phi1 = np.radians(lat1)
    phi2 = np.radians(lat2)
    d_lat = np.radians(lat2 - lat1)
    d_lon = np.radians(lon2 - lon1)
    a = np.sin(d_lat / 2) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(d_lon / 2) ** 2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make sqrt(1 - a) return NaN.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return R * c * 1000  # kilometers -> meters

# 15. Agrupacion de ubicacion segun la restriccion de distancia (min_distance)
def recommend_top_2_locations_for_category_with_distance_constraint(
    category, min_distance_meters=1000, grouped=None, n_candidates=5
):
    """Pick up to two well-scored locations of a category that are far enough apart.

    The rows of ``grouped`` whose 'main category' equals ``category`` are ranked by
    'combined_score'. The best row is always returned; the second row is the
    next-best candidate lying more than ``min_distance_meters`` away from it.

    Parameters
    ----------
    category : str
        Exact value to match against the 'main category' column.
    min_distance_meters : float, default 1000
        Minimum haversine distance (meters) required between the two locations.
    grouped : pandas.DataFrame, optional
        Aggregated table with at least the columns 'main category',
        'combined_score', 'latitude' and 'longitude'. Defaults to the notebook's
        global ``df_grouped``.
    n_candidates : int, default 5
        How many top-scored rows of the category are considered.

    Returns
    -------
    pandas.DataFrame
        Zero rows (same columns as ``grouped``) when the category has no entries,
        otherwise one or two rows, best first. Only one row is returned when no
        other candidate satisfies the distance constraint.
    """
    if grouped is None:
        grouped = df_grouped

    candidates = grouped[grouped['main category'] == category].nlargest(n_candidates, 'combined_score')

    # Unknown category (or nothing to rank): return the empty frame rather than
    # failing on iloc[0].
    if candidates.empty:
        return candidates

    best = candidates.iloc[0]
    selected = [best]

    # Walk the remaining candidates in score order and keep the first one that is
    # far enough from the best location.
    for i in range(1, len(candidates)):
        other = candidates.iloc[i]
        distance = haversine_distance(best['latitude'], best['longitude'], other['latitude'], other['longitude'])
        if distance > min_distance_meters:
            selected.append(other)
            break

    return pd.DataFrame(selected)

# 16. Print up to 2 candidate locations for a category typed by the user
# The examples in the prompt are categories that exist in the data; the match
# against 'main category' is exact, so surrounding whitespace is stripped.
category = input("Ingrese una categoria de las indicadas (e.g., 'Restaurants', 'Food', etc.): ").strip()

available_categories = set(df_grouped['main category'])
if category not in available_categories:
    # Tell the user what went wrong instead of failing inside the recommender.
    print(f"Categoria no encontrada: '{category}'")
    print("Categorias disponibles: " + ", ".join(sorted(map(str, available_categories))))
else:
    top_2_locations = recommend_top_2_locations_for_category_with_distance_constraint(category, min_distance_meters=1000)
    # Replace the original df_grouped labels with a 1-based ranking for display
    top_2_locations = top_2_locations.reset_index(drop=True)
    top_2_locations.index += 1

    print(f"Top 2 Ubicaciones por categoria:{category}:")
    print(top_2_locations[['main category', 'latitude', 'longitude', 'postal_code']])

Top 2 Ubicaciones por categoria:Pets:
  main category   latitude  longitude  postal_code
1          Pets  39.903348 -75.228662        19153
2          Pets  39.942996 -75.152209        19107


____________________

## ANALISIS DE METRICAS

## Error Cuadratico Medio (MSE)

In [31]:
# Mean squared error (MSE) of the predictions on the test split
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f"Mean Squared Error: {mse}")

Mean Squared Error: 0.00041758801372781493


### OBS: Un valor bajo de MSE implica que las predicciones son cercanas al valor real (buena precisión) y sugiere que los datos de la muestra son consistentes, puesto que esta métrica es muy susceptible a outliers.

## R^2

In [27]:
from sklearn.metrics import r2_score
# Coefficient of determination on the training split ...
y_train_pred = best_model.predict(X_train_scaled)
train_r2 = r2_score(y_true=y_train, y_pred=y_train_pred)
# ... and on the held-out test split
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print(f"Train R-squared: {train_r2}")
print(f"Test R-squared: {r2}")

Train R-squared: 0.9996319893288079
Test R-squared: 0.9995957937342933


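### OBS: Se podría pensar que este modelo está sobreentrenado, ya que el R^2 de entrenamiento es cercano a 1; sin embargo, el R^2 de prueba es igualmente alto. Al ser ambos similares y próximos a 1, se concluye que el modelo generaliza bien, al menos con la muestra tomada para el ensayo, que es un 10% al azar del dataframe original resultante de hacer merge de Yelp business, review y tip. Nota: `combined_score` se calcula como `stars + 3 * avg_sentiment` y ambas variables forman parte de las entradas del modelo, por lo que un R^2 cercano a 1 es esperable por construcción y no demuestra por sí solo capacidad predictiva.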

## Evaluacion por: Validacion Cruzada

In [29]:
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor

# Model configuration used for the cross-validation run
xgb_model = XGBRegressor(
    learning_rate=0.1,   # moderate learning rate
    max_depth=3,         # tree depth
    n_estimators=50,     # number of trees
    n_jobs=-1,           # use every available core
    random_state=42,     # fixed seed
)

# 3-fold cross-validation scored by R^2 (cv=3 to keep memory use and run time down)
cv_scores = cross_val_score(xgb_model, X_train_scaled, y_train, scoring='r2', cv=3)

# Spread of the fold scores
std_dev = cv_scores.std()

print(f"Puntaje Cross-validation R²: {cv_scores}")
print(f"Media de la Cross-validation R²: {cv_scores.mean()}")
print(f"Desviación estándar de la Cross-validation R²: {std_dev}")

Puntaje Cross-validation R²: [0.99802881 0.99815938 0.99822092]
Media de la Cross-validation R²: 0.998136371919648
Desviación estándar de la Cross-validation R²: 8.009874575524885e-05


### OBS: La validación cruzada es un método usado para evaluar la variabilidad y la confiabilidad de modelos de ML. Entre más altos y cercanos entre sí sean los puntajes de validación, mejor el desempeño. Dado que los 3 puntajes son consistentes y similares a lo largo de la muestra, y su desviación estándar es baja, el modelo es capaz de generalizar sin estar sobreentrenado.