<a href="https://colab.research.google.com/github/Aman2568/CODSOFT/blob/main/RecommendationSytemLogic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp310-cp310-linux_x86_64.whl size=2357245 sha256=0ad6394020466e2ba1050ef454155cb7da9eb471ecdf196927c565aa972d9f6d
  Stored in directory: /root/.cache/pip/wheels/4b/3f/df/6acbf0a40397d9bf3ff97f582cc22fb9ce66adde75bc71fd54
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.4


In [7]:
from surprise import Dataset, Reader, KNNBasic
from surprise.model_selection import train_test_split
from surprise import accuracy

data = Dataset.load_builtin('ml-100k')
#ml-100k The MovieLens 100k dataset (ml-100k) is a popular dataset for building recommendation systems
#It consists of 100,000 ratings from 943 users on 1682 movies , this dataset can be download from kaggle
reader = Reader(line_format='user item rating timestamp', sep='\t')
data = Dataset.load_from_file('/content/ml-100k/u.data', reader=reader) #taking u.data file from the dataset

trainset, testset = train_test_split(data, test_size=0.25)

sim_options = {
    'name': 'cosine',
    'user_based': True
}
model = KNNBasic(sim_options=sim_options)
model.fit(trainset)

predictions = model.test(testset)
accuracy.rmse(predictions)

def get_top_n_recommendations(predictions, n=10):
    top_n = {}
    for uid, iid, true_r, est, _ in predictions:
        if not top_n.get(uid):
            top_n[uid] = []
        top_n[uid].append((iid, est))

    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

top_n_recommendations = get_top_n_recommendations(predictions, n=10)
for user_id, user_ratings in top_n_recommendations.items():
    print(f"User {user_id} recommendations:")
    for item_id, rating in user_ratings:
        print(f"Item {item_id} with predicted rating {rating:.2f}")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Item 318 with predicted rating 4.70
Item 98 with predicted rating 4.55
Item 210 with predicted rating 4.38
Item 191 with predicted rating 4.35
Item 199 with predicted rating 4.28
Item 195 with predicted rating 4.23
Item 95 with predicted rating 4.18
Item 143 with predicted rating 4.18
Item 484 with predicted rating 4.17
Item 527 with predicted rating 4.13
User 313 recommendations:
Item 483 with predicted rating 4.73
Item 427 with predicted rating 4.70
Item 178 with predicted rating 4.60
Item 23 with predicted rating 4.47
Item 654 with predicted rating 4.43
Item 199 with predicted rating 4.43
Item 659 with predicted rating 4.35
Item 183 with predicted rating 4.30
Item 478 with predicted rating 4.25
Item 127 with predicted rating 4.22
User 603 recommendations:
Item 174 with predicted rating 4.50
Item 923 with predicted rating 4.23
Item 180 with predicted rating 4.12
Item 228 with predicted rating 4.05
Item 56 with predicted

In [8]:
!pip install pandas scikit-learn



In [12]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Sample data
data = {
    'movie_id': [1, 2, 3, 4, 5],
    'title': ['Movie A', 'Movie B', 'Movie C', 'Movie D', 'Movie E'],
    'description': [
        'Action packed adventure movie',
        'Romantic comedy with a twist',
        'Sci-fi thriller with aliens',
        'Drama about family relations',
        'Action movie with superheroes'
    ]
}

df = pd.DataFrame(data)

tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['description'])
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

def recommend_movies(title, cosine_sim=cosine_sim):
    idx = df.index[df['title'] == title].tolist()[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:4]
    movie_indices = [i[0] for i in sim_scores]
    return df['title'].iloc[movie_indices]

recommended_movies = recommend_movies('Movie A')
print("Recommended movies for 'Movie A':")
for movie in recommended_movies:
    print(movie)

Recommended movies for 'Movie A':
Movie E
Movie B
Movie C


***Another Method***

In [13]:
!pip install scikit-surprise

from surprise import Dataset, Reader, KNNBasic
from surprise.model_selection import train_test_split
from surprise import accuracy
import pandas as pd


ratings_columns = ['user_id', 'item_id', 'rating', 'timestamp']
ratings = pd.read_csv('ml-100k/u.data', sep='\t', names=ratings_columns, engine='python')

reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings[['user_id', 'item_id', 'rating']], reader)


trainset, testset = train_test_split(data, test_size=0.25)


sim_options = {
    'name': 'cosine',
    'user_based': True
}
model = KNNBasic(sim_options=sim_options)
model.fit(trainset)


predictions = model.test(testset)
accuracy.rmse(predictions)

def get_top_n_recommendations(predictions, n=10):
    top_n = {}
    for uid, iid, true_r, est, _ in predictions:
        if not top_n.get(uid):
            top_n[uid] = []
        top_n[uid].append((iid, est))

    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

top_n_recommendations = get_top_n_recommendations(predictions, n=10)
for user_id, user_ratings in top_n_recommendations.items():
    print(f"User {user_id} recommendations:")
    for item_id, rating in user_ratings:
        print(f"Item {item_id} with predicted rating {rating:.2f}")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Item 172 with predicted rating 4.48
Item 168 with predicted rating 4.45
Item 423 with predicted rating 4.42
Item 174 with predicted rating 4.40
Item 210 with predicted rating 4.22
Item 474 with predicted rating 4.15
Item 196 with predicted rating 4.15
Item 97 with predicted rating 4.15
Item 135 with predicted rating 4.13
User 693 recommendations:
Item 483 with predicted rating 4.72
Item 134 with predicted rating 4.37
Item 178 with predicted rating 4.37
Item 651 with predicted rating 4.35
Item 127 with predicted rating 4.33
Item 132 with predicted rating 4.32
Item 528 with predicted rating 4.30
Item 176 with predicted rating 4.25
Item 514 with predicted rating 4.23
Item 523 with predicted rating 4.20
User 402 recommendations:
Item 483 with predicted rating 4.68
Item 511 with predicted rating 4.45
Item 50 with predicted rating 4.40
Item 479 with predicted rating 4.30
Item 12 with predicted rating 4.27
Item 116 with predicte

In [19]:
!pip install pandas scikit-learn

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel


movies_columns = ['item_id', 'movie_title', 'release_date', 'video_release_date', 'IMDb_URL']
movies = pd.read_csv('ml-100k/u.item', sep='|', names=movies_columns, encoding='latin-1', engine='python')


movies = movies[['item_id', 'movie_title']]

movies['movie_title'] = movies['movie_title'].astype(str)
movies['movie_title'] = movies['movie_title'].str.replace(r'\s*\(.*\)', '', regex=True)  # Remove release year

# Adjust stop words or tokenization to retain meaningful words
tfidf = TfidfVectorizer(tokenizer=lambda x: x.split()) # Removing stop_words='english' to see if it helps
tfidf_matrix = tfidf.fit_transform(movies['movie_title'])


cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)


def recommend_movies(title, cosine_sim=cosine_sim):
    idx = movies.index[movies['movie_title'] == title].tolist()
    if not idx:
        return []
    idx = idx[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:4]  # Get top 3 recommendations
    movie_indices = [i[0] for i in sim_scores]
    return movies['movie_title'].iloc[movie_indices]


recommended_movies = recommend_movies('Star Wars')
print("Recommended movies for 'Star Wars':")
for movie in recommended_movies:
    print(movie)

Recommended movies for 'Star Wars':


