In [7]:
import pandas as pd
import numpy as np
import random

# ✅ Step 1: Read the existing (real) user records from disk
df_real = pd.read_csv(filepath_or_buffer="../data/dataset.csv")

# ✅ Step 2: Define possible values (same as your form options)
skills_list = [
    "Photography", "Videography", "Graphic Designing", "Animation",
    "Music Production", "UI/UX", "Content Creation", "Advertising", "Marketing"
]
skill_levels = ["Beginner", "Intermediate", "Advanced"]
content_types = ["Video tutorials", "Written guides", "Interactive activities"]
time_commitments = ["< 1 hour", "1–3 hours", "3–5 hours", "5+ hours"]

# Fixed seed so that Restart & Run All regenerates exactly the same synthetic users.
SYNTHETIC_SEED = 42


def generate_synthetic_users(n_users=100, start_id=10000, seed=SYNTHETIC_SEED):
    """Build a list of fake user records drawn from the option lists above.

    Args:
        n_users: Number of records to generate.
        start_id: ``user_id`` of the first record; ids increase by one per record.
        seed: Seed for a private ``random.Random`` instance. The same seed always
            yields the same records; pass ``None`` for a non-deterministic run.

    Returns:
        A list of dicts with the keys ``user_id``, ``username``, ``skills``,
        ``location``, ``interests``, ``skillLevel``, ``contentType`` and
        ``timeCommitment``. ``skills`` and ``interests`` are comma-joined strings
        of 1 to 5 distinct entries from ``skills_list``.
    """
    # A private generator keeps the result independent of whatever else in the
    # kernel has already consumed numbers from the global `random` state.
    rng = random.Random(seed)
    records = []
    for i in range(n_users):
        skills = rng.sample(skills_list, rng.randint(1, 5))
        interests = rng.sample(skills_list, rng.randint(1, 5))
        records.append({
            "user_id": start_id + i,
            "username": f"testuser_{i}",
            "skills": ",".join(skills),
            "location": f"City_{rng.randint(1, 50)}",
            "interests": ",".join(interests),
            "skillLevel": rng.choice(skill_levels),
            "contentType": rng.choice(content_types),
            "timeCommitment": rng.choice(time_commitments),
        })
    return records


# ✅ Step 3: Generate synthetic users (100 fake users with ids 10000-10099)
synthetic_data = generate_synthetic_users()

df_synthetic = pd.DataFrame(synthetic_data)

# ✅ Step 4: Append the synthetic rows after the real ones and renumber the index from 0
df_combined = pd.concat(objs=(df_real, df_synthetic), axis=0, ignore_index=True)

# ✅ Step 5: Write the augmented dataset (index column omitted), then preview the last rows
df_combined.to_csv(path_or_buf="../data/dataset_augmented.csv", index=False)
print(df_combined.tail(n=5))


     user_id     username                                             skills  \
115    10095  testuser_95             Animation,Music Production,Photography   
116    10096  testuser_96                                              UI/UX   
117    10097  testuser_97                                    UI/UX,Marketing   
118    10098  testuser_98  UI/UX,Content Creation,Graphic Designing,Photo...   
119    10099  testuser_99  Animation,Graphic Designing,Advertising,Music ...   

    location                                       interests    skillLevel  \
115  City_46              Videography,Content Creation,UI/UX  Intermediate   
116  City_20  Music Production,Videography,UI/UX,Photography  Intermediate   
117  City_36      Marketing,Animation,Music Production,UI/UX      Advanced   
118  City_14                                           UI/UX      Advanced   
119  City_32                                     Videography      Beginner   

                contentType timeCommitment  
115  

Unnamed: 0,user_id,username,skills,location,interests,skillLevel,contentType,timeCommitment
0,25,dev,,qwe,,,,
1,26,MOMMYYYYY,,qwe,,,,
2,27,Janu,"[""Graphic Designing""]",qwe,,,,
3,28,hastha,"[""Photography"";""UI/UX"";""Music Production""]",qwe,,,,
4,30,BANU,"[""Photography"";""Music Production"";""UI/UX""]",qwe,,,,


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# ✅ Toy inputs: one user profile and three posts, each a comma-separated skill string
user_profile = "Photography, Animation, Graphic Designing"
posts = [
    "Photography, Videography, Content Creation",
    "Music Production, Advertising, Marketing",
    "Graphic Designing, Animation, UI/UX"
]

# ✅ The user profile goes first so that row 0 of the TF-IDF matrix is the user
documents = [user_profile, *posts]

# ✅ Fit TF-IDF on the user and the posts together so they share one vocabulary
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)

# ✅ Cosine similarity of the user row against every post row -> shape (1, number of posts)
similarity_scores = cosine_similarity(tfidf_matrix[:1], tfidf_matrix[1:])

# ✅ Post positions ordered from most to least similar
top_indices = similarity_scores.argsort()[0][::-1]

print("Recommended post rankings:")
for index in top_indices:
    post_number = index + 1  # posts are reported 1-based
    score = similarity_scores[0][index]
    print(f"Post {post_number}: {posts[index]} → Similarity score: {score:.2f}")


Recommended post rankings:
Post 3: Graphic Designing, Animation, UI/UX → Similarity score: 0.60
Post 1: Photography, Videography, Content Creation → Similarity score: 0.21
Post 2: Music Production, Advertising, Marketing → Similarity score: 0.00


In [11]:
# Load the augmented user table and the posts table from the data directory
df_users = pd.read_csv(filepath_or_buffer="../data/dataset_augmented.csv")
df_posts = pd.read_csv(filepath_or_buffer="../data/posts.csv")


In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# ✅ STEP 1: Select any user from your dataset
current_user_id = 10005   # ✅ CHANGE this to any real or synthetic user id from dataset_augmented.csv

# ✅ STEP 2: Get user profile
matching_users = df_users[df_users['user_id'] == current_user_id]
if matching_users.empty:
    # Name the offending id instead of failing with an opaque positional IndexError from .iloc[0]
    raise IndexError(f"user_id {current_user_id} not found in df_users")
user_row = matching_users.iloc[0]  # if the id occurs more than once, the first match is used

# Empty CSV fields are parsed as NaN; formatting them directly would put the literal
# token "nan" into the profile and into the TF-IDF vocabulary, so only non-null
# fields are joined. With both fields present the result is "<skills>, <interests>".
user_profile = ", ".join(
    str(user_row[field]) for field in ("skills", "interests") if pd.notna(user_row[field])
)

# ✅ STEP 3: Prepare post profiles (use your real posts.csv columns!)
# Adds/overwrites a 'post_profile' column on df_posts: "tags, category, skillLevel" with NaN as ''
df_posts['post_profile'] = df_posts['tags'].fillna('') + ", " + df_posts['category'].fillna('') + ", " + df_posts['skillLevel'].fillna('')
posts = df_posts['post_profile'].tolist()

# ✅ STEP 4: Combine user + posts (user first, so row 0 of the matrix is the user)
documents = [user_profile] + posts

# ✅ STEP 5: Vectorize
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)

# ✅ STEP 6: Calculate similarity of the user row against every post row
similarity_scores = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])

# ✅ STEP 7: Show Top 5 Recommended Posts
# Positions into df_posts, most similar first. No score threshold is applied, so posts
# with 0.00 similarity are still listed when fewer than five posts match.
top_indices = similarity_scores.argsort()[0][::-1]

print(f"\nRecommended posts for User ID {current_user_id}:")
for index in top_indices[:5]:    # show top 5
    post = df_posts.iloc[index]
    print(f"Post ID {post['post_id']} → Title: {post['title']} → Tags: {post['tags']} → Similarity: {similarity_scores[0][index]:.2f}")



Recommended posts for User ID 10005:
Post ID 3 → Title: hghjgj → Tags: bvnb.;jl → Similarity: 0.10
Post ID 6 → Title: new post  → Tags: sdfbmdsbfmsnf → Similarity: 0.00
Post ID 5 → Title: hfjehfjke → Tags: egfjehr → Similarity: 0.00
Post ID 4 → Title: dfs → Tags: edfs → Similarity: 0.00
Post ID 2 → Title: How to lean photography  → Tags: photos;wildlife → Similarity: 0.00


In [15]:
pip install fastapi uvicorn scikit-learn pandas



SyntaxError: invalid syntax (4203269226.py, line 1)