# Import interaction matrix and project data

In [3]:
import pandas as pd

# Both CSV files are addressed relative to the working directory, so the
# Google Drive mount (see the drive.mount cell, which mounts at /content/drive)
# has to be in place before this cell runs.
# NOTE(review): presumably the working directory is /content -- confirm.
interaction_matrix_path = "drive/MyDrive/Recommendation System/real_interaction_matrix.csv"
project_data_path = "drive/MyDrive/Recommendation System/devpost_data_update5.csv"

# Later cells iterate rows of the interaction matrix as users and treat its
# column labels as project positions.
interaction_matrix_df = pd.read_csv(interaction_matrix_path)
project_df = pd.read_csv(project_data_path)

# Bare last expression so the notebook renders the interaction matrix.
interaction_matrix_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4094,4095,4096,4097,4098,4099,4100,4101,4102,4103
0,,,,,,,,,,,...,,,,,,,,,,


In [2]:
from google.colab import drive

# Directory under which Google Drive is exposed to the notebook; the CSV
# paths used by the data-loading cell live below this mount.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


# Normalize Interaction Matrix

In [None]:
import numpy as np

# Work on a float copy so the raw interaction values stay untouched.
interaction_matrix_np = interaction_matrix_df.to_numpy().astype(float)
normalized_click_matrix_row_max = np.copy(interaction_matrix_np)

# Scale each row by its own largest non-NaN entry and round to 2 decimals.
# Iterating a 2-D array yields row views, so writing through `row_view[:]`
# updates the matrix in place.
for row_view in normalized_click_matrix_row_max:
  row_peak = np.nanmax(row_view)
  if row_peak == 0:
    # Nothing to scale; also avoids dividing by zero.
    continue
  row_view[:] = np.round(row_view / row_peak, 2)

# Wrap the result in a DataFrame, blanking every cell whose raw value was
# exactly 0. The new frame gets default integer row/column labels.
zero_interactions = interaction_matrix_df == 0
normalized_click_df = pd.DataFrame(np.where(zero_interactions, np.nan, normalized_click_matrix_row_max))

# Process project feature matrix

In [None]:
# NOTE: this cell overwrites `project_df`, so it is meant to run once per
# kernel session; a second run fails because the dropped columns are gone.
unused_columns = ['description', 'similar_projects', 'clicks_count']
project_df = project_df.drop(columns=unused_columns)

# Encode the state as a number (open=2, upcoming=1, closed=0).
# Any state that is not a key of the mapping becomes NaN.
open_state_mapping = {'open': 2, 'upcoming': 1, 'closed': 0}
project_df['open_state'] = project_df['open_state'].map(open_state_mapping)

# Cast 'featured' to integers (presumably a boolean column -- TODO confirm).
project_df['featured'] = project_df['featured'].astype(int)

# Features are 'open_state', 'featured' and every column positioned AFTER
# 'end_date' ('end_date' itself is not included).
first_trailing_position = project_df.columns.get_loc('end_date') + 1
trailing_columns = project_df.columns[first_trailing_position:].tolist()
feature_columns = ['open_state', 'featured'] + trailing_columns
feature_matrix = project_df[feature_columns]

feature_matrix.head()

Unnamed: 0,open_state,featured,5_AR/VR,23_Beginner Friendly,11_Productivity,6_Machine Learning/AI,2_DevOps,25_Web,21_Enterprise,22_Open Ended,...,3_Fintech,26_Mobile,18_Design,15_Music/Art,27_Robotic Process Automation,12_Lifehacks,14_COVID-19,20_E-commerce/Retail,24_Quantum,8_Voice skills
0,2,1,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,1,0,0,0,1,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


# Generate the project similarity matrix using Pearson correlation

In [None]:
# DataFrame.corr correlates columns pairwise, so transpose first to make each
# project a column; the result is a square project-by-project matrix.
# 'pearson' is the default method, spelled out here for the reader.
project_similarity = feature_matrix.transpose().corr(method='pearson')
project_similarity.shape

(4104, 4104)

# Find the indices of the top 100 most similar projects for each project (closed projects are filtered out later, at recommendation time)

In [None]:
def _most_similar_labels(position, limit=100):
  """Return up to `limit` project labels ranked by similarity to `position`.

  Labels are ordered by descending similarity (NaN similarities sort last)
  and the project itself is left out of the result.
  """
  ranked_labels = project_similarity.iloc[position].sort_values(ascending=False).index
  return [label for label in ranked_labels if label != position][:limit]


# One list of 100 neighbour labels per project, in project order.
top_100_similar_projects = [
  _most_similar_labels(position)
  for position in range(project_similarity.shape[0])
]

# Row k holds the neighbours of project k, best match first.
top_100_similar_df = pd.DataFrame(top_100_similar_projects)
top_100_similar_df.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,37,2596,165,1470,1426,3115,1399,4073,904,3357,...,1348,1364,3582,1393,3352,1039,54,2618,2001,44
1,588,107,6,1524,2996,3404,2687,2109,1270,687,...,1691,444,3666,77,622,3740,402,2890,2908,1354
2,4098,1674,3552,2890,1853,2659,3847,1549,1833,546,...,45,348,3852,93,85,65,3700,25,2454,26
3,2908,2037,106,1582,1047,2706,587,3843,1010,850,...,3524,196,3648,3478,2690,701,1469,1366,1586,3266
4,4095,3061,1038,1804,1403,754,253,1381,1223,2389,...,2461,2002,1408,2988,1633,3395,620,617,3388,4099
5,402,302,389,910,1195,4022,130,3399,2368,701,...,4098,211,3183,455,1519,16,843,2262,421,248
6,93,4098,3552,1674,588,1668,1524,2687,2996,2109,...,3399,3466,196,2041,1342,739,1428,286,3027,3266
7,3026,2729,869,1762,1164,1363,849,1540,503,895,...,3085,1000,998,3464,3259,1200,3715,1659,1602,1798
8,2906,1377,3650,3096,3092,4085,2568,4100,1752,2743,...,69,987,1099,4058,2832,1121,928,1157,1179,902
9,31,1382,1417,1925,1553,1202,1481,3551,708,1427,...,1577,3182,1595,1424,2923,3090,1685,1477,1451,3024


# Predict the missing clicks for each user using item-based collaborative filtering

In [None]:
import numpy as np

# Interactions recorded for the first user (NaN entries dropped).
# Kept from the original cell; the prediction loop below does not use it.
user_clicked = interaction_matrix_df.loc[0].dropna()

# Same shape and labels as the interaction matrix. Only the cells a user has
# NOT interacted with get a predicted score; all other cells stay NaN.
predicted_scores_df = pd.DataFrame(index=interaction_matrix_df.index, columns=interaction_matrix_df.columns)

for user_idx in range(interaction_matrix_df.shape[0]):
  # Fetch this user's interaction row once instead of once per lookup.
  user_row = interaction_matrix_df.loc[user_idx]
  user_not_clicked = user_row[user_row.isna()]

  for missing_click_idx in user_not_clicked.index:
    # Column labels are converted to integer project positions.
    target_project = int(missing_click_idx)
    score = 0
    total = 0

    # Walk the top 100 most similar projects of the missing project.
    for similar_item_idx in top_100_similar_df.iloc[target_project]:
      # FIX: read the interaction of the user being predicted. The original
      # always read row 0, so every user was scored from the first user's
      # interactions.
      user_interaction = user_row.iloc[similar_item_idx]
      if pd.isna(user_interaction):
        # The user has no interaction with this neighbour; it contributes
        # nothing, so the similarity lookup is skipped as well.
        continue

      similarity_score = project_similarity.loc[target_project, similar_item_idx]
      score += similarity_score * user_interaction
      total += 1

    # Mean of similarity * interaction over the neighbours the user has
    # interacted with; 0 when there are none (avoids division by zero).
    average_score = np.round(score / total, 5) if total > 0 else 0
    predicted_scores_df.at[user_idx, missing_click_idx] = average_score


# After computing the predicted scores for each user, recommend the 5 highest-scoring projects (by index) that are not closed yet.

In [None]:
top_5_recommend_projects = []

# Encoded project state, looked up by project position; 0 means closed.
open_state = project_df['open_state']

for i in range(predicted_scores_df.shape[0]):
  # Only projects with a predicted score are candidates. Cells left NaN
  # (projects the user already interacted with were never scored) are
  # dropped so they cannot pad the list when few candidates remain.
  sort_predicted_scores_df = predicted_scores_df.iloc[i].dropna().sort_values(ascending=False).index

  # FIX: the original also required `int(project) != i`, which compared a
  # project index with the USER index `i` and wrongly excluded project i for
  # user i. The only filter that belongs here is "project is not closed".
  top_5 = [project for project in sort_predicted_scores_df if open_state.iloc[int(project)] != 0][:5]
  top_5_recommend_projects.append(top_5)

# One row per user, columns 0..4 holding the recommended project labels.
top_5_recommend_projects_df = pd.DataFrame(top_5_recommend_projects)
top_5_recommend_projects_df.head(10)

Unnamed: 0,0,1,2,3,4
0,588,2594,963,42,514


# Get the name for each project recommended to each user

In [None]:
# Title column, looked up by integer project position.
titles = project_df['title']

# Print one header line per user, followed by the titles of that user's
# recommended projects in ranked order.
for user_label, recommended in top_5_recommend_projects_df.iterrows():
  print(f'User{user_label} recommendation:')
  for project_idx in recommended:
    print(titles.iloc[int(project_idx)])


User0 recommendation:
Training Camp 9
Hackadon 2020 - COVID-19 ONLINE EDITION
2. Provinzial Rheinland Hackathon: Future of Insurance
Fintech Collab Hack: TIAA Challenge
Devfest'22 Algiers
