In [None]:
import warnings

import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import LabelEncoder


In [None]:
# Load dataset
# NOTE(review): absolute Google Drive path -- presumably only resolves on a Colab
# runtime with Drive mounted; confirm before running elsewhere.
DATA_PATH = '/content/drive/MyDrive/LY Project/Used Datasets/preferences.csv'
data = pd.read_csv(DATA_PATH)

# Label encoding for categorical variables
categorical_columns = ['Gender', 'Location', 'Habits', 'FoodPreference', 'Profession', 'Religion', 'SleepSchedule', 'CleanlinessHabits']

# Fit a separate encoder per column and keep each one. A single shared encoder is
# refit on every iteration and therefore only remembers the classes of the last
# column, which makes it impossible to map the integer codes back to labels.
# Use label_encoders[column].inverse_transform(codes) to recover the original values.
label_encoders = {}
for column in categorical_columns:
    label_encoder = LabelEncoder()
    # Overwrites the string column in `data` with its integer codes.
    data[column] = label_encoder.fit_transform(data[column])
    label_encoders[column] = label_encoder


In [None]:
# Show the first five rows of the frame after label encoding
data.head(n=5)

Unnamed: 0,ID,First Name,Last Name,Gender,Location,FullName,Age,Habits,FoodPreference,Profession,Religion,SleepSchedule,CleanlinessHabits
0,1,Dani,Mcleod,0,0,Dani Mcleod,21,0,1,0,1,0,0
1,2,Marvin,Allen,1,0,Marvin Allen,37,1,0,1,1,0,2
2,3,Moira,Carpenter,0,0,Moira Carpenter,36,0,1,1,1,0,1
3,4,Rose,Weldon,0,0,Rose Weldon,38,1,1,0,0,1,0
4,5,Rufus,Warden,1,0,Rufus Warden,35,0,1,1,1,1,2


In [None]:
# Split the data into training and testing sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Define features and target variable
features = categorical_columns + ['Age']

# Find the target column dynamically.
# Row identifiers must never be used as the regression target: 'Unnamed: 0' is the
# saved row index and the remaining entries are IDs / names. Taking simply the first
# non-feature column selects 'Unnamed: 0', i.e. the model would learn to predict the
# row position.
identifier_columns = {'Unnamed: 0', 'ID', 'First Name', 'Last Name', 'FullName'}
non_feature_columns = [col for col in data.columns if col not in features]
candidate_targets = [col for col in non_feature_columns if col not in identifier_columns]
if candidate_targets:
    target = candidate_targets[0]
else:
    # No real label column is available. Keep the historical fallback so the rest of
    # the notebook still runs, but make it loud that the metrics are meaningless.
    target = non_feature_columns[0]
    warnings.warn(
        f"No label column found in the dataset; falling back to {target!r}, which is a "
        "row identifier. The regression metrics computed from it are not meaningful."
    )

# Train a Random Forest Regressor to predict compatibility
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)
rf_regressor.fit(train_data[features], train_data[target])

# Predict compatibility for the test set.
# `assign` returns a new frame, so we never write into the object returned by
# train_test_split (which can trigger pandas' SettingWithCopyWarning).
test_data = test_data.assign(
    **{'Predicted Compatibility': rf_regressor.predict(test_data[features])}
)

# Calculate similarity using cosine similarity
# The result is a dense (n_test x n_test) matrix labelled on both axes by the
# original row index of each test user.
# NOTE(review): the inputs are raw integer codes plus unscaled Age, so Age presumably
# dominates the cosine score -- confirm whether scaling / one-hot encoding is intended.
similarity_matrix = cosine_similarity(test_data[features])
similarity_df = pd.DataFrame(similarity_matrix, index=test_data.index, columns=test_data.index)


In [None]:
# Get top N similar users for each user
top_n = 10
top_similar_users = {}
for user_id in test_data.index:
    # Remove the user's own entry by label before ranking. Skipping position 0 after
    # sorting is not reliable: several users can tie at similarity 1.0, in which case
    # the user is not guaranteed to be first and ends up in their own neighbour list.
    ranked_neighbours = (
        similarity_df.loc[user_id]
        .drop(labels=user_id)
        .sort_values(ascending=False)
    )
    top_similar_users[user_id] = list(ranked_neighbours.index[:top_n])

# Display top similar users for a small preview of users only; printing one line per
# test user produces thousands of lines and gets truncated by the notebook frontend.
preview_users = 5
for user_id, similar_users in list(top_similar_users.items())[:preview_users]:
    print(f"Top {top_n} similar users for User {user_id}: {similar_users}")
print(f"(showing {min(preview_users, len(top_similar_users))} of {len(top_similar_users)} users; see top_similar_users for the rest)")

# Evaluate the model
mse = mean_squared_error(test_data[target], test_data['Predicted Compatibility'])
r2 = r2_score(test_data[target], test_data['Predicted Compatibility'])

print(f"\nMean Squared Error: {mse}")
print(f"R-squared: {r2}")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Top 10 similar users for User 27505: [43940, 61658, 75602, 9367, 43178, 18412, 48809, 41586, 64549, 25445]
Top 10 similar users for User 77308: [13954, 77695, 23079, 25867, 39306, 39342, 32877, 77308, 73185, 28659]
Top 10 similar users for User 72107: [57525, 72107, 61556, 72820, 28010, 12528, 38198, 60779, 54533, 31507]
Top 10 similar users for User 63571: [63571, 11925, 65837, 73602, 55644, 20569, 22040, 23678, 24084, 45523]
Top 10 similar users for User 63485: [50749, 23875, 22529, 33403, 65378, 51633, 54642, 47243, 58462, 37581]
Top 10 similar users for User 53541: [37212, 32722, 23158, 25142, 1817, 5907, 35766, 38266, 56820, 65402]
Top 10 similar users for User 64697: [64697, 42258, 6884, 33253, 45932, 10735, 8355, 45195, 52793, 15320]
Top 10 similar users for User 9767: [3011, 32594, 50068, 16873, 70883, 25647, 15323, 51597, 62082, 5501]
Top 10 similar users for User 23676: [23676, 39287, 10032, 52453, 66176, 60400,

In [None]:
# Hyperparameter tuning of the Random Forest with an exhaustive grid search.
# GridSearchCV is imported in the notebook's top import cell.

# Define hyperparameter grid for Grid Search
# 3 * 4 * 3 * 3 = 108 combinations, each fitted cv=3 times (324 forest fits).
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create the GridSearchCV object
# n_jobs=-1 spreads the independent fits over all CPU cores; the fixed random_state
# keeps the selected model reproducible.
grid_search = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid,
    cv=3,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)

# Fit the GridSearchCV object to the data
grid_search.fit(train_data[features], train_data[target])

# Get the best model from Grid Search
best_rf_model = grid_search.best_estimator_
print(f"Best hyperparameters: {grid_search.best_params_}")

# Predict compatibility for the test set using the best model.
# This replaces the 'Predicted Compatibility' column with the tuned predictions.
# `assign` builds a new frame instead of writing into a possible slice of `data`,
# which avoids pandas' SettingWithCopyWarning.
test_data = test_data.assign(
    **{'Predicted Compatibility': best_rf_model.predict(test_data[features])}
)

# Evaluate the model
# NOTE(review): the recorded run reports R-squared of about 0.0015, i.e. the features
# explain almost nothing about `target` -- verify that `target` names a real label
# column rather than a row identifier.
mse = mean_squared_error(test_data[target], test_data['Predicted Compatibility'])
r2 = r2_score(test_data[target], test_data['Predicted Compatibility'])

print(f"\nMean Squared Error (after hyperparameter tuning): {mse}")
print(f"R-squared (after hyperparameter tuning): {r2}")



Mean Squared Error (after hyperparameter tuning): 811251223.3084964
R-squared (after hyperparameter tuning): 0.0015042609207730484
