In [1]:
import pandas as pd
from sklearn.base import clone
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [2]:
# Read the listings/ratings CSV into the working DataFrame.
# NOTE(review): absolute, machine-specific path - the notebook only runs where this exact file exists.
DATA_PATH = r'C:\Users\PC\Desktop\FaresGradProject\listings_rating_clean.csv'
df = pd.read_csv(DATA_PATH)

In [9]:
# Predictor columns and the rating column used as the regression target
features = [
    'beds', 'price', 'category', 'livings', 'wc', 'area',
    'ketchen', 'furnished', 'city', 'district', 'width', 'length',
]
y = df['user.review']
X = df[features]

In [10]:
# Preprocessing: standardise the columns handled as numeric, one-hot encode city and district
# NOTE(review): 'category' looks like an integer code rather than a quantity - confirm that scaling it is intended.
scaled_columns = ['beds', 'price', 'category', 'livings', 'wc', 'area', 'ketchen', 'furnished', 'width', 'length']
encoded_columns = ['city', 'district']
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), scaled_columns),
    # handle_unknown='ignore': values not seen during fit are encoded as all zeros instead of raising
    ('cat', OneHotEncoder(handle_unknown='ignore'), encoded_columns),
])

In [11]:
# Wrap the column transformer in a single-step pipeline used only for preprocessing
preprocessing_steps = [('preprocessor', preprocessor)]
pipeline = Pipeline(steps=preprocessing_steps)

In [12]:
# Apply preprocessing to the entire dataset
# Fits the preprocessing pipeline on every row of X and transforms those rows in one call;
# the resulting matrix is what KMeans is fitted on in the clustering cell.
# NOTE: `pipeline` is rebound to a preprocessing + regressor pipeline by the per-cluster
# loop further down, so re-running this cell after that loop will not reproduce this result.
X_processed = pipeline.fit_transform(X)

In [13]:
# Clustering
# n_init is passed explicitly: its default differs between scikit-learn releases
# (10 restarts vs. 'auto'), so leaving it implicit makes the cluster labels depend on
# the installed version and triggers a FutureWarning on some releases.
n_clusters = 5
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
# Fit on, and assign a cluster label to, every row of the dataset
df['cluster'] = kmeans.fit_predict(X_processed)

In [14]:
# split the data if needed for further modeling or training
# The target column is dropped from the feature side so that X_train / X_test never
# carry 'user.review'; the same rows are selected as before (same length, same random_state).
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['user.review']), y, test_size=0.2, random_state=42
)

In [16]:
# Random Forest for regression within each cluster
# One independent preprocessing + regressor pipeline is trained and scored per KMeans cluster.
performance_metrics = {}  # cluster id -> (MSE, MAE) on that cluster's held-out rows
cluster_models = {}       # cluster id -> fitted pipeline for that cluster
for cluster in range(n_clusters):
    cluster_data = df[df['cluster'] == cluster]
    # train_test_split cannot build a non-empty train and test part from fewer than two rows
    if len(cluster_data) < 2:
        print(f'Cluster {cluster} - skipped: only {len(cluster_data)} row(s), cannot split')
        continue
    X_cluster = cluster_data.drop(['user.review', 'cluster'], axis=1)
    y_cluster = cluster_data['user.review']

    X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X_cluster, y_cluster, test_size=0.2, random_state=42)
    # clone() gives every cluster its own unfitted copy of the transformer. Pipeline.fit
    # fits its steps in place, so sharing the single `preprocessor` object would refit it
    # for each cluster and silently change the earlier preprocessing pipeline as well.
    # NOTE: this rebinds `pipeline`, which previously named the preprocessing-only pipeline.
    pipeline = Pipeline([
        ('preprocessor', clone(preprocessor)),
        ('regressor', RandomForestRegressor(n_estimators=10, random_state=42))
    ])

    pipeline.fit(X_train_c, y_train_c)
    y_pred_c = pipeline.predict(X_test_c)
    mse_c = mean_squared_error(y_test_c, y_pred_c)
    mae_c = mean_absolute_error(y_test_c, y_pred_c)
    performance_metrics[cluster] = (mse_c, mae_c)
    cluster_models[cluster] = pipeline
    print(f'Cluster {cluster} - Mean Squared Error: {mse_c}, Mean Absolute Error: {mae_c}')


Cluster 0 - Mean Squared Error: 0.2728734259907686, Mean Absolute Error: 0.3497646237142624
Cluster 1 - Mean Squared Error: 0.27228248577873143, Mean Absolute Error: 0.3109835308861272
Cluster 2 - Mean Squared Error: 0.25172781627936586, Mean Absolute Error: 0.3445884112218403
Cluster 3 - Mean Squared Error: 0.27017136192371755, Mean Absolute Error: 0.28878432034640705
Cluster 4 - Mean Squared Error: 0.33087826846184576, Mean Absolute Error: 0.3791248831187151


In [17]:
# Recommendation function (can be refined based on specific criteria)
def recommend_from_cluster(cluster_id, top_n=5, data=None):
    """Return the highest-rated rows of one cluster.

    Parameters
    ----------
    cluster_id : value compared against the 'cluster' column.
    top_n : int, default 5
        Maximum number of rows to return.
    data : pandas.DataFrame, optional
        Frame holding at least the 'cluster' and 'user.review' columns.
        When omitted, the notebook-level ``df`` is used, as before.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of the selected cluster, ordered by
        'user.review' from highest to lowest. Empty if no row matches.
    """
    # Fall back to the global frame only when the caller did not supply one
    listings = df if data is None else data
    in_cluster = listings[listings['cluster'] == cluster_id]
    return in_cluster.sort_values(by='user.review', ascending=False).head(top_n)

In [20]:
# Example: the 5 highest-rated rows of cluster 3
top_properties = recommend_from_cluster(cluster_id=3, top_n=5)
print(top_properties)

            id      price  category  beds  livings   wc   area  ketchen  \
0        40278  5500000.0         7   0.0      0.0  0.0  720.0      0.0   
63305  4861635  1400000.0         9   5.0      1.0  2.0  447.0      0.0   
62592  4857949   720000.0         6   5.0      1.0  4.0  171.0      0.0   
62608  4858008   720000.0         6   6.0      1.0  4.0  230.0      0.0   
62610  4858012   420000.0         6   3.0      1.0  2.0  110.0      0.0   

       furnished  location.lat  location.lng  user.review         city  \
0            0.0     21.408780     39.789310          5.0  مكة المكرمة   
63305        0.0     21.557821     39.767807          5.0  مكة المكرمة   
62592        0.0     21.552203     39.244640          5.0          جدة   
62608        0.0     21.541000     39.288191          5.0          جدة   
62610        0.0     21.550089     39.245586          5.0          جدة   

       city_id     district  district_id  width  length  cluster  
0           94   حي الرصيفة         3

In [24]:
# Evaluate each cluster and select the top 2 properties based on user reviews
# The per-cluster pieces are collected first and concatenated once: growing a DataFrame
# with pd.concat inside the loop copies all previous rows on every iteration, and
# concatenating onto an empty DataFrame is deprecated behaviour in recent pandas.
per_cluster_top = []
for cluster_id in range(n_clusters):
    cluster_rows = df[df['cluster'] == cluster_id]
    per_cluster_top.append(cluster_rows.nlargest(2, 'user.review'))

# pd.concat raises on an empty list, so keep the original empty-frame result for that case
top_recommendations = pd.concat(per_cluster_top) if per_cluster_top else pd.DataFrame()

print(top_recommendations)

          id       price  category  beds  livings   wc    area  ketchen  \
116  2111194  18000000.0         3   7.0      3.0  5.0  2450.0      1.0   
163  2430758    350000.0         3   7.0      3.0  5.0   430.0      1.0   
5     131908   1500000.0         9   5.0      3.0  4.0   290.0      1.0   
6     188508   1800000.0         3   5.0      1.0  2.0   400.0      1.0   
114  2104551   1450000.0         3   5.0      2.0  5.0   312.0      1.0   
166  2441613   3700000.0         3   7.0      0.0  5.0   716.0      1.0   
0      40278   5500000.0         7   0.0      0.0  0.0   720.0      0.0   
29    991955   3000000.0         7   0.0      0.0  0.0   319.0      0.0   
153  2367455    400000.0         9   4.0      1.0  3.0   600.0      1.0   
355  2915796   5000000.0         3   7.0      5.0  5.0  1000.0      1.0   

     furnished  location.lat  location.lng  user.review             city  \
116        0.0     24.753895     46.707722          5.0           الرياض   
163        0.0     24.