<a href="https://colab.research.google.com/github/7jadhavAbhi7/Content-Recommendation-/blob/master/Untitled39.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


Content Recommendation System
===============================
Task: Recommend top 3 posts for each user based on profile interests,
      past engagement, and content attributes.

Dataset Structure:
- Users: user_id, age, gender, top_3_interests, past_engagement_score
- Posts: post_id, creator_id, content_type, tags
- Engagements: user_id, post_id, engagement (binary: 1=engaged, 0=not engaged)

# SECTION 1: SETUP AND IMPORTS

In [32]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from collections import defaultdict
import warnings
warnings.filterwarnings('ignore')

print("="*70)
print("CONTENT RECOMMENDATION SYSTEM".center(70))
print("="*70)
print("\n✓ Libraries imported successfully!\n")

                    CONTENT RECOMMENDATION SYSTEM                     

✓ Libraries imported successfully!



#Upload Data Files

In [33]:
from google.colab import files

print("Please upload your 3 CSV files:")
print("1. Users.csv")
print("2. Posts.csv")
print("3. Engagements.csv\n")

uploaded = files.upload()
print("\n✓ Files uploaded successfully!")

Please upload your 3 CSV files:
1. Users.csv
2. Posts.csv
3. Engagements.csv



Saving Users.csv to Users (1).csv
Saving Engagements.csv to Engagements (1).csv
Saving Posts.csv to Posts (1).csv

✓ Files uploaded successfully!


#Load and Explore Data

In [34]:
print("="*70)
print("LOADING DATA".center(70))
print("="*70)

users_df = pd.read_csv('Users.csv')
posts_df = pd.read_csv('Posts.csv')
engagements_df = pd.read_csv('Engagements.csv')

print(f"\n📊 Dataset Statistics:")
print(f"   • Users: {len(users_df)} users")
print(f"   • Posts: {len(posts_df)} posts")
print(f"   • Engagements: {len(engagements_df)} records")

total_positive_engagements = engagements_df['engagement'].sum()
engagement_rate = total_positive_engagements / len(engagements_df) * 100

print(f"\n📈 Engagement Overview:")
print(f"   • Total positive engagements: {total_positive_engagements}")
print(f"   • Overall engagement rate: {engagement_rate:.2f}%")
print(f"   • Avg engagements per user: {total_positive_engagements/len(users_df):.2f}")


# Show samples
print("Users Sample:")
display(users_df.head(3))

print("Posts Sample:")
display(posts_df.head(3))

print("Engagements Sample:")
display(engagements_df.head(6))

                             LOADING DATA                             

📊 Dataset Statistics:
   • Users: 50 users
   • Posts: 100 posts
   • Engagements: 1000 records

📈 Engagement Overview:
   • Total positive engagements: 497
   • Overall engagement rate: 49.70%
   • Avg engagements per user: 9.94

👥 Users Sample:


Unnamed: 0,user_id,age,gender,top_3_interests,past_engagement_score
0,U1,24,F,"sports, art, gaming",0.61
1,U2,32,F,"travel, food, fashion",0.93
2,U3,28,Other,"sports, travel, fashion",0.4



📝 Posts Sample:


Unnamed: 0,post_id,creator_id,content_type,tags
0,P1,U44,video,"sports, food"
1,P2,U26,video,"music, travel"
2,P3,U32,text,"sports, travel"



💬 Engagements Sample:


Unnamed: 0,user_id,post_id,engagement
0,U1,P52,1
1,U1,P44,0
2,U1,P1,1
3,U1,P4,1
4,U1,P65,0
5,U1,P32,1


#Data Processsing

In [35]:
print("="*70)
print("DATA PREPROCESSING".center(70))
print("="*70)

# Handle missing values
users_df = users_df.fillna({'top_3_interests': '', 'past_engagement_score': 0.5})
posts_df = posts_df.fillna({'tags': ''})

# Standardize column names
users_df.columns = users_df.columns.str.strip()
posts_df.columns = posts_df.columns.str.strip()
engagements_df.columns = engagements_df.columns.str.strip()

# Create combined features for posts
posts_df['combined_features'] = (
    posts_df['content_type'].astype(str) + ' ' +
    posts_df['tags'].astype(str)
)

print("Data preprocessing complete!")
print(f"Missing values handled")
print(f"Combined features created")
print(posts_df.head())

                          DATA PREPROCESSING                          
✓ Data preprocessing complete!
✓ Missing values handled
✓ Combined features created
  post_id creator_id content_type            tags    combined_features
0      P1        U44        video    sports, food   video sports, food
1      P2        U26        video   music, travel  video music, travel
2      P3        U32         text  sports, travel  text sports, travel
3      P4         U6        image   music, gaming  image music, gaming
4      P5        U32        image   food, fashion  image food, fashion


#Feature Engineering

In [36]:
print("="*70)
print("FEATURE ENGINEERING".center(70))
print("="*70)

# Create user-post engagement matrix
positive_engagements = engagements_df[engagements_df['engagement'] == 1].copy()

user_post_matrix = positive_engagements.pivot_table(
    index='user_id',
    columns='post_id',
    values='engagement',
    fill_value=0
)

# Add missing users
all_users = users_df['user_id'].unique()
missing_users = set(all_users) - set(user_post_matrix.index)
for user in missing_users:
    user_post_matrix.loc[user] = 0

user_post_matrix = user_post_matrix.sort_index()

print(f"✓ User-Post Matrix: {user_post_matrix.shape}")
print(f"✓ Sparsity: {(1 - user_post_matrix.sum().sum() / user_post_matrix.size) * 100:.2f}%")

# TF-IDF for content
tfidf = TfidfVectorizer(max_features=50, stop_words='english', ngram_range=(1, 2))
post_tfidf_matrix = tfidf.fit_transform(posts_df['combined_features'])

print(f"✓ TF-IDF Matrix: {post_tfidf_matrix.shape}")

# Post similarity matrix
post_similarity = cosine_similarity(post_tfidf_matrix)
post_similarity_df = pd.DataFrame(
    post_similarity,
    index=posts_df['post_id'],
    columns=posts_df['post_id']
)

print("✓ Feature engineering complete!")

                         FEATURE ENGINEERING                          
✓ User-Post Matrix: (50, 97)
✓ Sparsity: 89.75%
✓ TF-IDF Matrix: (100, 50)
✓ Feature engineering complete!


# Helper Function

In [37]:
def interest_based_score(user_id, post_id, users_df, posts_df):
    """Score posts based on user interest overlap"""
    user_row = users_df[users_df['user_id'] == user_id].iloc[0]
    post_row = posts_df[posts_df['post_id'] == post_id].iloc[0]

    user_interests = set(user_row['top_3_interests'].lower().split(','))
    user_interests = {i.strip() for i in user_interests if i.strip()}

    post_tags = set(post_row['tags'].lower().split(','))
    post_tags = {t.strip() for t in post_tags if t.strip()}

    if not user_interests:
        return 0

    overlap = len(user_interests & post_tags)
    return overlap / len(user_interests)

print("✓ Helper functions defined")

✓ Helper functions defined


# Collaborative Filtering Model

In [38]:
def collaborative_filtering_scores(user_id, user_post_matrix, top_k_users=10):
    """Generate scores using collaborative filtering"""
    if user_id not in user_post_matrix.index:
        return {}

    # Calculate user similarity
    user_similarity = cosine_similarity(user_post_matrix)
    user_sim_df = pd.DataFrame(
        user_similarity,
        index=user_post_matrix.index,
        columns=user_post_matrix.index
    )

    # Get similar users
    similar_users = user_sim_df[user_id].sort_values(ascending=False)[1:top_k_users+1]

    # Get user's engaged posts
    user_engagements = user_post_matrix.loc[user_id]
    engaged_posts = set(user_engagements[user_engagements > 0].index)

    # Calculate scores
    scores = {}
    for post_id in user_post_matrix.columns:
        if post_id not in engaged_posts:
            score = 0
            for sim_user, similarity in similar_users.items():
                if similarity > 0:
                    score += similarity * user_post_matrix.loc[sim_user, post_id]
            scores[post_id] = score

    return scores

print("Collaborative filtering model defined")

✓ Collaborative filtering model defined


# Content Based Model

In [39]:
def content_based_scores(user_id, user_post_matrix, posts_df, post_similarity_df):
    """Generate scores using content-based filtering"""
    if user_id not in user_post_matrix.index:
        return {}

    # Get engaged posts
    user_engagements = user_post_matrix.loc[user_id]
    engaged_posts = user_engagements[user_engagements > 0].index.tolist()

    if not engaged_posts:
        return {}

    # Calculate similarity scores
    scores = {}
    for post_id in posts_df['post_id']:
        if post_id not in engaged_posts and post_id in post_similarity_df.columns:
            similarities = [post_similarity_df.loc[engaged, post_id]
                          for engaged in engaged_posts
                          if engaged in post_similarity_df.index]
            if similarities:
                scores[post_id] = np.mean(similarities)

    return scores

print("✓ Content-based filtering model defined")

✓ Content-based filtering model defined


# Hybrid Based Model

In [40]:
def hybrid_recommendations(user_id, users_df, posts_df, user_post_matrix,
                          post_similarity_df, weights=None, top_n=3):
    """Generate hybrid recommendations"""
    if weights is None:
        weights = {
            'interest': 0.40,
            'collaborative': 0.30,
            'content': 0.25,
            'engagement_history': 0.05
        }

    # Get engaged posts
    if user_id in user_post_matrix.index:
        user_engagements = user_post_matrix.loc[user_id]
        engaged_posts = set(user_engagements[user_engagements > 0].index)
    else:
        engaged_posts = set()

    # Get user engagement score
    user_row = users_df[users_df['user_id'] == user_id].iloc[0]
    user_eng_score = user_row['past_engagement_score']

    # Get scores from each model
    cf_scores = collaborative_filtering_scores(user_id, user_post_matrix)
    cb_scores = content_based_scores(user_id, user_post_matrix, posts_df, post_similarity_df)

    # Normalize scores
    def normalize_scores(scores_dict):
        if not scores_dict:
            return {}
        values = list(scores_dict.values())
        min_val, max_val = min(values), max(values)
        if max_val == min_val:
            return {k: 0.5 for k in scores_dict}
        return {k: (v - min_val) / (max_val - min_val) for k, v in scores_dict.items()}

    cf_scores_norm = normalize_scores(cf_scores)
    cb_scores_norm = normalize_scores(cb_scores)

    # Combine scores
    candidate_posts = posts_df['post_id'].tolist()
    post_scores = {}

    for post_id in candidate_posts:
        if post_id in engaged_posts:
            continue

        interest_score = interest_based_score(user_id, post_id, users_df, posts_df)
        cf_score = cf_scores_norm.get(post_id, 0)
        cb_score = cb_scores_norm.get(post_id, 0)

        combined_score = (
            weights['interest'] * interest_score +
            weights['collaborative'] * cf_score +
            weights['content'] * cb_score +
            weights['engagement_history'] * user_eng_score
        )

        post_scores[post_id] = combined_score

    # Sort and return top N
    sorted_posts = sorted(post_scores.items(), key=lambda x: x[1], reverse=True)
    return [post_id for post_id, score in sorted_posts[:top_n]]

print("✓ Hybrid recommendation system defined")

✓ Hybrid recommendation system defined


# Generate Recommendations

In [41]:
print("="*70)
print("GENERATING RECOMMENDATIONS".center(70))
print("="*70)

recommendations = []

for idx, user_id in enumerate(users_df['user_id'], 1):
    try:
        top_posts = hybrid_recommendations(
            user_id,
            users_df,
            posts_df,
            user_post_matrix,
            post_similarity_df,
            top_n=3
        )

        for rank, post_id in enumerate(top_posts, 1):
            recommendations.append({
                'user_id': user_id,
                'post_id': post_id,
                'rank': rank
            })

        if idx % 5 == 0:
            print(f"Processed {idx}/{len(users_df)} users...", end='\r')

    except Exception as e:
        print(f"Error with user {user_id}: {str(e)}")

recommendations_df = pd.DataFrame(recommendations)

print(f"\n\n✓ Generated {len(recommendations_df)} recommendations!")
print(f"✓ Covered {len(recommendations_df['user_id'].unique())} users")

                      GENERATING RECOMMENDATIONS                      
Processed 50/50 users...

✓ Generated 150 recommendations!
✓ Covered 50 users


# Sample Recommendation

In [42]:
print("="*70)
print("SAMPLE RECOMMENDATIONS".center(70))
print("="*70)

for user in users_df['user_id'].head(5):
    user_recs = recommendations_df[recommendations_df['user_id'] == user]
    user_info = users_df[users_df['user_id'] == user].iloc[0]

    print(f"\n👤 {user} - {user_info['age']}yo {user_info['gender']}")
    print(f"   Interests: {user_info['top_3_interests']}")
    print(f"   Recommendations:")

    for _, row in user_recs.iterrows():
        post_info = posts_df[posts_df['post_id'] == row['post_id']].iloc[0]
        print(f"      {row['rank']}. {row['post_id']} - {post_info['content_type']} ({post_info['tags']})")

                        SAMPLE RECOMMENDATIONS                        

👤 U1 - 24yo F
   Interests: sports, art, gaming
   Recommendations:
      1. P65 - image (sports)
      2. P22 - audio (sports, art)
      3. P50 - image (sports)

👤 U2 - 32yo F
   Interests: travel, food, fashion
   Recommendations:
      1. P53 - video (fashion, tech)
      2. P7 - image (food, fitness)
      3. P1 - video (sports, food)

👤 U3 - 28yo Other
   Interests: sports, travel, fashion
   Recommendations:
      1. P39 - video (travel, sports)
      2. P17 - video (food, sports)
      3. P1 - video (sports, food)

👤 U4 - 25yo M
   Interests: fashion, music, tech
   Recommendations:
      1. P53 - video (fashion, tech)
      2. P51 - image (tech)
      3. P99 - image (gaming, music)

👤 U5 - 24yo M
   Interests: fashion, food, fitness
   Recommendations:
      1. P26 - image (food, fitness)
      2. P70 - image (art, food)
      3. P1 - video (sports, food)


# Calculate Metrics

In [43]:
print("\n" + "="*70)
print("EVALUATION METRICS".center(70))
print("="*70)

# Precision@3
def calculate_precision_at_k(recommendations_df, engagements_df, k=3):
    precisions = []
    for user_id in recommendations_df['user_id'].unique():
        user_recs = recommendations_df[recommendations_df['user_id'] == user_id]['post_id'].head(k).tolist()
        user_positive = engagements_df[
            (engagements_df['user_id'] == user_id) &
            (engagements_df['engagement'] == 1)
        ]['post_id'].tolist()

        hits = len(set(user_recs) & set(user_positive))
        precisions.append(hits / k if k > 0 else 0)

    return np.mean(precisions)

# Coverage
def calculate_coverage(recommendations_df, posts_df):
    recommended = recommendations_df['post_id'].unique()
    total = posts_df['post_id'].unique()
    return len(recommended) / len(total)

# Diversity
def calculate_diversity(recommendations_df, post_similarity_df):
    diversities = []
    for user_id in recommendations_df['user_id'].unique():
        user_recs = recommendations_df[recommendations_df['user_id'] == user_id]['post_id'].tolist()
        if len(user_recs) < 2:
            continue

        sims = []
        for i in range(len(user_recs)):
            for j in range(i+1, len(user_recs)):
                if user_recs[i] in post_similarity_df.index and user_recs[j] in post_similarity_df.columns:
                    sims.append(post_similarity_df.loc[user_recs[i], user_recs[j]])

        if sims:
            diversities.append(1 - np.mean(sims))

    return np.mean(diversities) if diversities else 0

# Interest Alignment
def calculate_interest_alignment(recommendations_df, users_df, posts_df):
    alignments = []
    for user_id in recommendations_df['user_id'].unique():
        user_recs = recommendations_df[recommendations_df['user_id'] == user_id]['post_id'].tolist()
        scores = [interest_based_score(user_id, pid, users_df, posts_df) for pid in user_recs]
        if scores:
            alignments.append(np.mean(scores))
    return np.mean(alignments)

# Calculate all metrics
precision = calculate_precision_at_k(recommendations_df, engagements_df, k=3)
coverage = calculate_coverage(recommendations_df, posts_df)
diversity = calculate_diversity(recommendations_df, post_similarity_df)
alignment = calculate_interest_alignment(recommendations_df, users_df, posts_df)

print(f"\n📊 RESULTS:")
print(f"   • Precision@3: {precision:.4f} ({precision*100:.2f}%)")
print(f"   • Coverage: {coverage:.4f} ({coverage*100:.2f}%)")
print(f"   • Diversity: {diversity:.4f}")
print(f"   • Interest Alignment: {alignment:.4f} ({alignment*100:.2f}%)")


                          EVALUATION METRICS                          

📊 RESULTS:
   • Precision@3: 0.0000 (0.00%)
   • Coverage: 0.5600 (56.00%)
   • Diversity: 0.7637
   • Interest Alignment: 0.4244 (42.44%)


# Save Results

In [44]:
print("\n" + "="*70)
print("SAVING RESULTS".center(70))
print("="*70)

# Save main recommendations
recommendations_df.to_csv('recommendations_output.csv', index=False)
print("✓ Saved: recommendations_output.csv")

# Save detailed version
detailed_recs = recommendations_df.merge(
    posts_df[['post_id', 'content_type', 'tags']], on='post_id', how='left'
).merge(
    users_df[['user_id', 'top_3_interests']], on='user_id', how='left'
)
detailed_recs.to_csv('recommendations_detailed.csv', index=False)
print("✓ Saved: recommendations_detailed.csv")

# Save summary
summary = pd.DataFrame([{
    'total_users': len(users_df),
    'total_posts': len(posts_df),
    'precision_at_3': precision,
    'coverage': coverage,
    'diversity': diversity,
    'interest_alignment': alignment
}])
summary.to_csv('recommendation_summary.csv', index=False)
print("✓ Saved: recommendation_summary.csv")

print("\n✅ ALL DONE! Check your output files.")


                            SAVING RESULTS                            
✓ Saved: recommendations_output.csv
✓ Saved: recommendations_detailed.csv
✓ Saved: recommendation_summary.csv

✅ ALL DONE! Check your output files.


# Download Results

In [45]:
# Download output files
from google.colab import files

print("Downloading output files...")
files.download('recommendations_output.csv')
files.download('recommendations_detailed.csv')
files.download('recommendation_summary.csv')

print("✓ Downloads complete!")

Downloading output files...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✓ Downloads complete!
