In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder

In [11]:
# Load your dataset
df = pd.read_csv(r'C:\Users\PC\Desktop\FaresGradProject\listings_rating_clean.csv')
# Features and target
features = ['beds', 'price', 'category', 'livings', 'wc', 'area', 'ketchen', 'furnished', 'city', 'district', 'width', 'length']
X = df[features]
y = df['user.review']
# Preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['beds', 'price', 'category', 'livings', 'wc', 'area', 'ketchen', 'furnished', 'width', 'length']),
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['city', 'district'])
    ])
# Create a pipeline for preprocessing
pipeline = Pipeline([
    ('preprocessor', preprocessor)
])
# Apply preprocessing to the entire dataset
X_processed = pipeline.fit_transform(X)
# Clustering
n_clusters = 5
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
df['cluster'] = kmeans.fit_predict(X_processed)  # Fit and predict clusters for the whole dataset
# Now, you can split the data if needed for further modeling or training
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.2, random_state=42)

In [12]:
for cluster in range(n_clusters):
    cluster_data = df[df['cluster'] == cluster]
    X_cluster = cluster_data.drop(['user.review', 'cluster'], axis=1)
    y_cluster = cluster_data['user.review']
    
    X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X_cluster, y_cluster, test_size=0.2, random_state=42)
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', RandomForestRegressor(n_estimators=10, random_state=42))
    ])

In [15]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

# Define the grid of hyperparameters
grid = {
    'n_estimators': [80, 100, 120, 140, 160],
    'max_features': ['sqrt'],
    'max_depth': [24, 26, 28, 30, 32],
    'criterion': ['friedman_mse'],
    'random_state': [0]
}

# Initialize an empty dictionary to store the best hyperparameters and scores for each cluster
best_params_per_cluster = {}
best_scores_per_cluster = {}

# Iterate over each cluster
for cluster in range(n_clusters):
    cluster_data = df[df['cluster'] == cluster]
    X_cluster = cluster_data.drop(['user.review', 'cluster'], axis=1)
    y_cluster = cluster_data['user.review']
    
    # Split the data into training and testing sets
    X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X_cluster, y_cluster, test_size=0.2, random_state=42)
    
    # Define the preprocessing pipeline
    pipeline = Pipeline([
        ('preprocessor', preprocessor)
    ])
    
    # Apply preprocessing to the training data
    X_train_c_processed = pipeline.fit_transform(X_train_c)
    
    # Initialize GridSearchCV for the current cluster
    gs = GridSearchCV(estimator=RandomForestRegressor(), param_grid=grid, scoring='neg_mean_squared_error', cv=10, n_jobs=-1)
    
    # Fit GridSearchCV to find the best hyperparameters for the current cluster
    gs.fit(X_train_c_processed, y_train_c)
    
    # Store the best hyperparameters and scores for the current cluster
    best_params_per_cluster[cluster] = gs.best_params_
    best_scores_per_cluster[cluster] = gs.best_score_

    print(f'Cluster {cluster} - Best MSE: {gs.best_score_}, Best Parameters: {gs.best_params_}')

# Print the best hyperparameters and scores for each cluster
for cluster, params in best_params_per_cluster.items():
    print(f'Cluster {cluster} - Best MSE: {best_scores_per_cluster[cluster]}, Best Parameters: {params}')


Cluster 0 - Best MSE: -0.23268822185415453, Best Parameters: {'criterion': 'friedman_mse', 'max_depth': 32, 'max_features': 'sqrt', 'n_estimators': 140, 'random_state': 0}
Cluster 1 - Best MSE: -0.23342077564376046, Best Parameters: {'criterion': 'friedman_mse', 'max_depth': 32, 'max_features': 'sqrt', 'n_estimators': 160, 'random_state': 0}
Cluster 2 - Best MSE: -0.22422126930418, Best Parameters: {'criterion': 'friedman_mse', 'max_depth': 32, 'max_features': 'sqrt', 'n_estimators': 160, 'random_state': 0}
Cluster 3 - Best MSE: -0.2353861919699745, Best Parameters: {'criterion': 'friedman_mse', 'max_depth': 32, 'max_features': 'sqrt', 'n_estimators': 160, 'random_state': 0}
Cluster 4 - Best MSE: -0.27211600919023116, Best Parameters: {'criterion': 'friedman_mse', 'max_depth': 32, 'max_features': 'sqrt', 'n_estimators': 160, 'random_state': 0}
Cluster 0 - Best MSE: -0.23268822185415453, Best Parameters: {'criterion': 'friedman_mse', 'max_depth': 32, 'max_features': 'sqrt', 'n_estimator