In [45]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
import warnings
warnings.filterwarnings('ignore')
from sklearn.decomposition import PCA


In [46]:
data = pd.read_csv('D:\MY WORK\DATA SCIENCE PROJECT\SOCIAL MEDIA RECOMMENDATION\SocialMediaUsersDataset.csv')
data.head()

Unnamed: 0,UserID,Name,Gender,DOB,Interests,City,Country
0,1,Jesse Lawhorn,Female,1958-10-15,"'Movies', 'Fashion', 'Fashion', 'Books'",Sibolga,Indonesia
1,2,Stacy Payne,Female,2004-07-21,"'Gaming', 'Finance and investments', 'Outdoor ...",Al Abyār,Libya
2,3,Katrina Nicewander,Female,2000-02-07,"'DIY and crafts', 'Music', 'Science', 'Fashion'",Wādī as Sīr,Jordan
3,4,Eric Yarbrough,Male,1985-04-14,"'Outdoor activities', 'Cars and automobiles'",Matera,Italy
4,5,Daniel Adkins,Female,1955-09-18,"'Politics', 'History'",Biruaca,Venezuela


In [47]:
data.isnull().sum()

UserID       0
Name         0
Gender       0
DOB          0
Interests    0
City         0
Country      0
dtype: int64

In [48]:
data.isna().sum()

UserID       0
Name         0
Gender       0
DOB          0
Interests    0
City         0
Country      0
dtype: int64

In [49]:
data.describe()

Unnamed: 0,UserID
count,100000.0
mean,50000.5
std,28867.657797
min,1.0
25%,25000.75
50%,50000.5
75%,75000.25
max,100000.0


In [50]:
data['DOB'] = pd.to_datetime(data['DOB'])
data['Age'] = (pd.Timestamp.now() - data['DOB']).dt.days // 365

In [51]:
data.head()

Unnamed: 0,UserID,Name,Gender,DOB,Interests,City,Country,Age
0,1,Jesse Lawhorn,Female,1958-10-15,"'Movies', 'Fashion', 'Fashion', 'Books'",Sibolga,Indonesia,66
1,2,Stacy Payne,Female,2004-07-21,"'Gaming', 'Finance and investments', 'Outdoor ...",Al Abyār,Libya,20
2,3,Katrina Nicewander,Female,2000-02-07,"'DIY and crafts', 'Music', 'Science', 'Fashion'",Wādī as Sīr,Jordan,25
3,4,Eric Yarbrough,Male,1985-04-14,"'Outdoor activities', 'Cars and automobiles'",Matera,Italy,39
4,5,Daniel Adkins,Female,1955-09-18,"'Politics', 'History'",Biruaca,Venezuela,69


In [52]:
scaler = MinMaxScaler()
data['Age'] = scaler.fit_transform(data[['Age']])
data.head()

Unnamed: 0,UserID,Name,Gender,DOB,Interests,City,Country,Age
0,1,Jesse Lawhorn,Female,1958-10-15,"'Movies', 'Fashion', 'Fashion', 'Books'",Sibolga,Indonesia,0.901961
1,2,Stacy Payne,Female,2004-07-21,"'Gaming', 'Finance and investments', 'Outdoor ...",Al Abyār,Libya,0.0
2,3,Katrina Nicewander,Female,2000-02-07,"'DIY and crafts', 'Music', 'Science', 'Fashion'",Wādī as Sīr,Jordan,0.098039
3,4,Eric Yarbrough,Male,1985-04-14,"'Outdoor activities', 'Cars and automobiles'",Matera,Italy,0.372549
4,5,Daniel Adkins,Female,1955-09-18,"'Politics', 'History'",Biruaca,Venezuela,0.960784


In [53]:
encoder = OneHotEncoder(sparse_output=False)
Gender_encoded = encoder.fit_transform(data[['Gender']])
gender_column = encoder.get_feature_names_out(['Gender'])
gender_df = pd.DataFrame(Gender_encoded,columns=gender_column)
gender_df.index = data.index
data = pd.concat([data, gender_df], axis=1)
data.head()

Unnamed: 0,UserID,Name,Gender,DOB,Interests,City,Country,Age,Gender_Female,Gender_Male
0,1,Jesse Lawhorn,Female,1958-10-15,"'Movies', 'Fashion', 'Fashion', 'Books'",Sibolga,Indonesia,0.901961,1.0,0.0
1,2,Stacy Payne,Female,2004-07-21,"'Gaming', 'Finance and investments', 'Outdoor ...",Al Abyār,Libya,0.0,1.0,0.0
2,3,Katrina Nicewander,Female,2000-02-07,"'DIY and crafts', 'Music', 'Science', 'Fashion'",Wādī as Sīr,Jordan,0.098039,1.0,0.0
3,4,Eric Yarbrough,Male,1985-04-14,"'Outdoor activities', 'Cars and automobiles'",Matera,Italy,0.372549,0.0,1.0
4,5,Daniel Adkins,Female,1955-09-18,"'Politics', 'History'",Biruaca,Venezuela,0.960784,1.0,0.0


In [54]:
vectorizer = TfidfVectorizer(stop_words='english',max_features=500,ngram_range=(1,2),max_df=0.8,min_df=5)
interest_vectorized = vectorizer.fit_transform(data['Interests'])
interest_array = interest_vectorized.toarray()
interest_array = scaler.fit_transform(interest_array)
interest_array.shape

(100000, 500)

In [55]:
pca = PCA(n_components=50)
interest_array_pca = pca.fit_transform(interest_array)

In [56]:
user_profile = np.hstack([
    data['Age'].values.reshape(-1,1)*0.2,
    data['Gender_Male'].values.reshape(-1,1)*0.1,
    data['Gender_Female'].values.reshape(-1,1)*0.1,
    interest_array_pca*0.3
])

In [57]:
'''knn =  NearestNeighbors(n_neighbors=30,algorithm='brute',metric='cosine',n_jobs=-1)
knn.fit(user_profile)

distance, indices = knn.kneighbors(user_profile) '''


"knn =  NearestNeighbors(n_neighbors=30,algorithm='brute',metric='cosine',n_jobs=-1)\nknn.fit(user_profile)\n\ndistance, indices = knn.kneighbors(user_profile) "

In [106]:
def recommend_user(user_index,user_profile,min_similarity=0.8,top_n=5):
    user_vector = user_profile[user_index].reshape(1,-1)
    user_similarity = cosine_similarity(user_vector,user_profile).flatten()
    similar_users = [(i,user_similarity[i]) for i in range(len(user_similarity)) if i!=user_index]
    similar_users = [users for users in similar_users if users[1]>min_similarity]
    similar_users.sort(key=lambda x:x[1],reverse=True)
    top_similar_users = similar_users[:top_n]


    similar_users_info = []
    for idx,similarity in top_similar_users:
        user_info = data.iloc[idx].to_dict()
        user_info['similarity'] = round(similarity,4)
        similar_users_info.append(user_info)
        #avg_similarity = np.mean(user_info['similarity'])
        #print(f"Average Similarity: {avg_similarity}")


    if similar_users_info:
        recommendations_df = pd.DataFrame(similar_users_info)
    else:
        recommendations_df = pd.DataFrame(columns=data.columns.tolist() + ['Similarity'])
    avg_similarity = np.mean([sim[1] for sim in top_similar_users]) if top_similar_users else 0

    return recommendations_df,round(avg_similarity,4)
    


In [107]:
user_index = 100
recommendations_df, avg_sim = recommend_user(user_index, user_profile, min_similarity=0.8, top_n=5)

print("Similar Users DataFrame:")
print(recommendations_df)
print("\nAverage Similarity Score:", avg_sim)

Similar Users DataFrame:
   UserID              Name  Gender        DOB  \
0    3948    Douglas Pierce  Female 1982-04-16   
1   56132       Marlyn Tull  Female 1956-04-21   
2   53268  Idella Demetriou  Female 1984-04-06   
3    1138     Tommy Watkins  Female 1978-04-29   
4   54616     Kelly Sanchez  Female 1978-07-17   

                        Interests        City        Country       Age  \
0  'Music', 'Health and wellness'      Toulon         France  0.431373   
1  'Music', 'Health and wellness'     Roswell  United States  0.941176   
2  'Music', 'Health and wellness'    Tasbuget     Kazakhstan  0.392157   
3  'Health and wellness', 'Music'  Bode Saadu        Nigeria  0.509804   
4  'Health and wellness', 'Music'   Riihimäki        Finland  0.509804   

   Gender_Female  Gender_Male  similarity  
0            1.0          0.0      0.9954  
1            1.0          0.0      0.9945  
2            1.0          0.0      0.9937  
3            1.0          0.0      0.9906  
4        

In [112]:
user_index=1000
recommend_user(user_index,user_profile,min_similarity=0.8,top_n=5)

(   UserID             Name  Gender        DOB                   Interests  \
 0   28201    Perla Bittner  Female 1961-09-20  'Pets', 'Art', 'Gardening'   
 1   42448    Solomon Cosby  Female 1956-06-20  'Gardening', 'Art', 'Pets'   
 2   50647    Evelyn Nelson  Female 1963-03-10        'Art', 'Art', 'Pets'   
 3   21329  Stanley Edwards  Female 1961-11-14               'Pets', 'Art'   
 4    4774     Irene Hunter  Female 1963-03-23               'Pets', 'Art'   
 
                    City Country       Age  Gender_Female  Gender_Male  \
 0               Zhongba   China  0.843137            1.0          0.0   
 1               Ajodhya   India  0.941176            1.0          0.0   
 2                 Ikeda   Japan  0.823529            1.0          0.0   
 3                Kasama  Zambia  0.843137            1.0          0.0   
 4  Aparecida do Taboado  Brazil  0.803922            1.0          0.0   
 
    similarity  
 0      0.9718  
 1      0.9669  
 2      0.9488  
 3      0.9346  

In [113]:
def multiple_users(user_indices,user_profile,min_similarity=0.8,top_n=5):
    recommend_users = []
    for user_index in user_indices:
        recommend_users.append(recommend_user(user_index,user_profile,min_similarity,top_n))
    return recommend_users

    
    
user_indices = np.random.choice(len(user_profile),5,replace=False)

In [114]:
multiple_users(user_indices,user_profile,min_similarity=0.8,top_n=5)       

[(   UserID             Name Gender        DOB  \
  0    4805    Dwight Murphy   Male 1962-12-11   
  1    1284        Star Imai   Male 1982-06-22   
  2   68118  William Ehrhart   Male 1967-01-08   
  3   98327       Rita Paulk   Male 1994-12-16   
  4   87088     Robert Blake   Male 1977-11-16   
  
                                             Interests         City  \
  0  'Social causes and activism', 'Nature', 'Techn...      Esbjerg   
  1  'Nature', 'Technology', 'Social causes and act...      Samdari   
  2  'Nature', 'Technology', 'Social causes and act...  Castle Rock   
  3  'Technology', 'Nature', 'Social causes and act...        Massy   
  4  'Nature', 'Technology', 'Nature', 'Social caus...      Tatsuno   
  
           Country       Age  Gender_Female  Gender_Male  similarity  
  0        Denmark  0.823529            0.0          1.0      0.9995  
  1          India  0.431373            0.0          1.0      0.9930  
  2  United States  0.745098            0.0          1.

In [115]:
for i in range(len(user_indices)):
    user_index = user_indices[i]
    recommendations = recommend_user(user_index, user_profile, min_similarity=0.8, top_n=5)
    print(recommendations)


(   UserID             Name Gender        DOB  \
0    4805    Dwight Murphy   Male 1962-12-11   
1    1284        Star Imai   Male 1982-06-22   
2   68118  William Ehrhart   Male 1967-01-08   
3   98327       Rita Paulk   Male 1994-12-16   
4   87088     Robert Blake   Male 1977-11-16   

                                           Interests         City  \
0  'Social causes and activism', 'Nature', 'Techn...      Esbjerg   
1  'Nature', 'Technology', 'Social causes and act...      Samdari   
2  'Nature', 'Technology', 'Social causes and act...  Castle Rock   
3  'Technology', 'Nature', 'Social causes and act...        Massy   
4  'Nature', 'Technology', 'Nature', 'Social caus...      Tatsuno   

         Country       Age  Gender_Female  Gender_Male  similarity  
0        Denmark  0.823529            0.0          1.0      0.9995  
1          India  0.431373            0.0          1.0      0.9930  
2  United States  0.745098            0.0          1.0      0.9925  
3         France  0