In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Load dataset
movies = pd.read_csv("/content/drive/MyDrive/Data/Final_Dataset.csv")

print(movies.head(10))

                     movie_name  \
0                            +1   
1  10 Rules for Sleeping Around   
2    10 Things I Hate About You   
3                     100 Girls   
4                      11/11/11   
5                         11:11   
6                     12 Rounds   
7                13 Going on 30   
8                      17 Again   
9                     18 Again!   

                                               Actor  \
0                                                NaN   
1  Lauren Swickard, Reid Ewing, Jesse Bradford, B...   
2  Julia Stiles, Larry Miller, Andrew Keegan, All...   
3  Marissa Ribisi, Jaime Pressly, Jonathan Tucker...   
4                                     Michael Landes   
5                                      Laura Mennell   
6  Renny Harlin, Aidan Gillen, Ashley Scott, Tayl...   
7  Ashley Benson, Gia Mantegna, Brie Larson, Chri...   
8  Sterling Knight, Zac Efron, Katrina Norman, Ji...   
9  George Burns, Jennifer Runyon, Charlie Schlatt...  

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(stop_words="english")

# Convert movie descriptions into TF-IDF feature matrix
tfidf_matrix = tfidf.fit_transform(movies['Description'].fillna(''))  # Fill NaN values

# Check matrix shape
print(tfidf_matrix.shape)

(1583, 8310)


In [7]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity matrix
content_similarity = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Convert to a DataFrame
content_similarity_df = pd.DataFrame(content_similarity, index=movies['movie_name'], columns=movies['movie_name'])

print(content_similarity_df.head())

movie_name                          +1  10 Rules for Sleeping Around  \
movie_name                                                             
+1                            1.000000                           0.0   
10 Rules for Sleeping Around  0.000000                           1.0   
10 Things I Hate About You    0.000000                           0.0   
100 Girls                     0.056407                           0.0   
11/11/11                      0.000000                           0.0   

movie_name                    10 Things I Hate About You  100 Girls  11/11/11  \
movie_name                                                                      
+1                                              0.000000   0.056407       0.0   
10 Rules for Sleeping Around                    0.000000   0.000000       0.0   
10 Things I Hate About You                      1.000000   0.056166       0.0   
100 Girls                                       0.056166   1.000000       0.0   
11/11/11 

In [8]:
def recommend_movies_content(movie_title, similarity_matrix, num_recommendations=5):
    if movie_title not in similarity_matrix.index:
        return "Movie not found in dataset."

    # Get similarity scores
    sim_scores = similarity_matrix[movie_title]

    # Sort and select top similar movies
    sim_scores = sim_scores.sort_values(ascending=False)[1:num_recommendations+1]

    return sim_scores.index.tolist()

# Example
print(recommend_movies_content("100 Girls", content_similarity_df))

['Private School', 'Tomboy', 'Where the Boys Are', 'Class', 'Aquamarine']


In [9]:
# Simulate user-item matrix
user_item_matrix = movies.pivot_table(index="movie_name", values="Ratings", aggfunc="mean").fillna(0)

print(user_item_matrix.head())


                              Ratings
movie_name                           
+1                                3.0
10 Rules for Sleeping Around      3.0
10 Things I Hate About You        2.0
100 Girls                         2.0
11/11/11                          1.0


In [10]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity between movies based on ratings
collab_similarity = cosine_similarity(user_item_matrix, user_item_matrix)

# Convert to a DataFrame
collab_similarity_df = pd.DataFrame(collab_similarity, index=user_item_matrix.index, columns=user_item_matrix.index)

print(collab_similarity_df.head())

movie_name                     +1  10 Rules for Sleeping Around  \
movie_name                                                        
+1                            1.0                           1.0   
10 Rules for Sleeping Around  1.0                           1.0   
10 Things I Hate About You    1.0                           1.0   
100 Girls                     1.0                           1.0   
11/11/11                      1.0                           1.0   

movie_name                    10 Things I Hate About You  100 Girls  11/11/11  \
movie_name                                                                      
+1                                                   1.0        1.0       1.0   
10 Rules for Sleeping Around                         1.0        1.0       1.0   
10 Things I Hate About You                           1.0        1.0       1.0   
100 Girls                                            1.0        1.0       1.0   
11/11/11                                    

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Preprocess the actor data
movies['Actor'] = movies['Actor'].fillna('').apply(lambda x: ' '.join(x.split(',')))

actor_tfidf = TfidfVectorizer(stop_words="english")

actor_matrix = actor_tfidf.fit_transform(movies['Actor'])

# Cosine similarity between movies based on actors
actor_similarity = cosine_similarity(actor_matrix, actor_matrix)

# Convert to a DataFrame
actor_similarity_df = pd.DataFrame(actor_similarity, index=movies['movie_name'], columns=movies['movie_name'])
print(actor_similarity_df.head())


movie_name                     +1  10 Rules for Sleeping Around  \
movie_name                                                        
+1                            0.0                      0.000000   
10 Rules for Sleeping Around  0.0                      1.000000   
10 Things I Hate About You    0.0                      0.000000   
100 Girls                     0.0                      0.000000   
11/11/11                      0.0                      0.040033   

movie_name                    10 Things I Hate About You  100 Girls  11/11/11  \
movie_name                                                                      
+1                                              0.000000   0.000000  0.000000   
10 Rules for Sleeping Around                    0.000000   0.000000  0.040033   
10 Things I Hate About You                      1.000000   0.162213  0.000000   
100 Girls                                       0.162213   1.000000  0.000000   
11/11/11                                    

In [12]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics.pairwise import cosine_similarity

movies['genres'] = movies['genres'].apply(lambda x: x.split(','))

# MultiLabelBinarizer to convert genres into binary vectors
mlb = MultiLabelBinarizer()
genre_matrix = mlb.fit_transform(movies['genres'])

genre_matrix_df = pd.DataFrame(genre_matrix, columns=mlb.classes_, index=movies['movie_name'])

# Similarity using cosine similarity
genre_similarity = cosine_similarity(genre_matrix, genre_matrix)

# Convert to a DataFrame
genre_similarity_df = pd.DataFrame(genre_similarity, index=movies['movie_name'], columns=movies['movie_name'])

print(genre_similarity_df.head())


movie_name                     +1  10 Rules for Sleeping Around  \
movie_name                                                        
+1                            1.0                      0.000000   
10 Rules for Sleeping Around  0.0                      1.000000   
10 Things I Hate About You    0.0                      0.408248   
100 Girls                     0.0                      0.816497   
11/11/11                      0.0                      0.000000   

movie_name                    10 Things I Hate About You  100 Girls  11/11/11  \
movie_name                                                                      
+1                                              0.000000   0.000000       0.0   
10 Rules for Sleeping Around                    0.408248   0.816497       0.0   
10 Things I Hate About You                      1.000000   0.333333       0.0   
100 Girls                                       0.333333   1.000000       0.0   
11/11/11                                    

In [13]:
def hybrid_recommendation_with_genre(movie_title, content_similarity_df, actor_similarity_df, genre_similarity_df, alpha=0.3, beta=0.3):
    if movie_title not in content_similarity_df.index:
        return "Movie not found in dataset."

    # Get content similarity scores
    content_scores = content_similarity_df[movie_title]

    # Get actor similarity scores
    actor_scores = actor_similarity_df[movie_title]

    # Get genre similarity scores
    genre_scores = genre_similarity_df[movie_title]

    # Combine the scores
    hybrid_scores = alpha * content_scores + beta * actor_scores + (1 - alpha - beta) * genre_scores

    # Sort the scores and return the top 5 recommendations
    hybrid_scores = hybrid_scores.sort_values(ascending=False)[1:6]  # Top 5 excluding the movie itself

    return hybrid_scores.index.tolist()

# Example: Get top 5 recommendations for ""Snow White and the Seven Dwarfs""
top_5_recommendations_with_genre = hybrid_recommendation_with_genre("Snow White and the Seven Dwarfs", content_similarity_df, actor_similarity_df, genre_similarity_df, alpha=0.3, beta=0.3)

print("Top 5 recommendations:", top_5_recommendations_with_genre)


Top 5 recommendations: ['Sleeping Beauty', 'Shrek the Third', 'The Hunchback of Notre Dame', 'Freaky Friday', 'Pocahontas']


In [14]:
# Example: Get top 5 recommendations for "+1"
top_5_recommendations_with_genre = hybrid_recommendation_with_genre(
    "+1", content_similarity_df, actor_similarity_df, genre_similarity_df, alpha=0.3, beta=0.3)

print("Top 5 recommendations for '+1' with genre-based filtering:", top_5_recommendations_with_genre)


Top 5 recommendations for '+1' with genre-based filtering: ['Animal', 'Air', 'Vlad', 'Stranded', 'The Bad Batch']


In [15]:
# Remove duplicate emotions and split into individual emotions
movies['emotion'] = movies['emotion'].apply(lambda x: list(set(x.split(' | '))))

# Binary encoding for each emotion (multi-label classification)
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
emotion_matrix = mlb.fit_transform(movies['emotion'])

# DataFrame with binary encoding for emotions
emotion_matrix_df = pd.DataFrame(emotion_matrix, columns=mlb.classes_, index=movies['movie_name'])

emotion_matrix_df.head()


Unnamed: 0_level_0,anger,anticipation,disgust,fear,joy,optimism,sadness,surprise
movie_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
+1,0,0,0,1,0,0,0,0
10 Rules for Sleeping Around,0,0,0,0,1,0,0,0
10 Things I Hate About You,1,0,0,0,0,0,0,0
100 Girls,0,0,1,0,0,0,0,0
11/11/11,0,0,0,0,0,0,1,0


In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english')

# Fit and transform the movie descriptions into numerical vectors
description_matrix = vectorizer.fit_transform(movies['Description'])

# Convert into a DataFrame
description_matrix_df = pd.DataFrame(description_matrix.toarray(), columns=vectorizer.get_feature_names_out(), index=movies['movie_name'])
description_matrix_df.head()

Unnamed: 0_level_0,00,000,10,100,108,11,1100,12,12th,13,...,zombies,zone,zoo,zookeeper,zoom,zor,zorro,zuko,zurich,état
movie_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
+1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10 Rules for Sleeping Around,0.0,0.0,0.300372,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10 Things I Hate About You,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100 Girls,0.0,0.0,0.0,0.260205,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11/11/11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(movies['Description'])

# Simplify emotion to a single label
movies['emotion'] = movies['emotion'].apply(lambda x: x[0])  # Use first listed emotion

# Label encoding
label_encoder = LabelEncoder()
y_single_label = label_encoder.fit_transform(movies['emotion'])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y_single_label, test_size=0.2, random_state=42)

# Train Naive Bayes classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)

# Predict and evaluate
y_pred = nb_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")


Accuracy: 0.416403785488959


In [18]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import StackingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(movies['Description'])


# simplified to a single emotion
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(movies['emotion'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Define base models
base_learners = [
    ('naive_bayes', MultinomialNB(alpha=0.1)),  # Use the best Naive Bayes model
    ('decision_tree', DecisionTreeClassifier(max_depth=5)),  # Add Decision Tree model
    ('random_forest', RandomForestClassifier(n_estimators=50)),  # Add Random Forest model
    ('logistic_regression', LogisticRegression(max_iter=1000))  # Add Logistic Regression model
]

# Meta-model
meta_model = LogisticRegression()

# Stacking Classifier
stacking_clf = StackingClassifier(estimators=base_learners, final_estimator=meta_model)

# Train the stacking classifier
stacking_clf.fit(X_train, y_train)

# Evaluate on the test set
y_pred = stacking_clf.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of the Stacking Classifier with multiple base models: {accuracy}")




Accuracy of the Stacking Classifier with multiple base models: 0.4921135646687697


In [19]:
import warnings
from sklearn.model_selection import RandomizedSearchCV, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Suppress warnings
warnings.filterwarnings("ignore")

# Define reduced hyperparameter grids
param_dist_dt = {
    'max_depth': [5, 10, None],
    'min_samples_split': [2, 5]
}
param_dist_rf = {
    'n_estimators': [50, 100],
    'max_depth': [5, None],
    'min_samples_split': [2]
}
param_dist_lr = {
    'C': [0.1, 1, 10],
    'solver': ['liblinear']
}

# Base models with RandomizedSearchCV (faster than GridSearchCV)
dt_clf = RandomizedSearchCV(DecisionTreeClassifier(), param_distributions=param_dist_dt, n_iter=4, cv=3, n_jobs=-1)
rf_clf = RandomizedSearchCV(RandomForestClassifier(), param_distributions=param_dist_rf, n_iter=4, cv=3, n_jobs=-1)
lr_clf = RandomizedSearchCV(LogisticRegression(max_iter=1000), param_distributions=param_dist_lr, n_iter=3, cv=3, n_jobs=-1)

# Meta-models to compare
meta_models = [LogisticRegression(max_iter=1000), RandomForestClassifier(n_estimators=50)]

# Base learners
base_learners = [
    ('naive_bayes', MultinomialNB(alpha=0.1)),
    ('decision_tree', dt_clf),
    ('random_forest', rf_clf),
    ('logistic_regression', lr_clf)
]

# Track best meta-model and accuracy
best_accuracy = 0
best_meta_model = None

# Try each meta-model
for meta_model in meta_models:
    stacking_clf = StackingClassifier(estimators=base_learners, final_estimator=meta_model, n_jobs=-1)
    cv_score = cross_val_score(stacking_clf, X_train, y_train, cv=3, n_jobs=-1)
    avg_cv_accuracy = cv_score.mean()

    if avg_cv_accuracy > best_accuracy:
        best_accuracy = avg_cv_accuracy
        best_meta_model = meta_model

# Final training on full training data with best meta-model
stacking_clf = StackingClassifier(estimators=base_learners, final_estimator=best_meta_model, n_jobs=-1)
stacking_clf.fit(X_train, y_train)

# Test accuracy
y_pred = stacking_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Final Accuracy of Optimized Stacking Classifier: {accuracy:.4f}")


Final Accuracy of Optimized Stacking Classifier: 0.5016


In [20]:
from transformers import DistilBertTokenizer, DistilBertModel
import torch

# Load pre-trained DistilBERT
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')
model.eval()  # Set model to evaluation mode

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): DistilBertSdpaAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): L

In [21]:
def get_distilbert_embedding(text):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)
    cls_embedding = outputs.last_hidden_state[:, 0, :]  # First token ([CLS]-equivalent)
    return cls_embedding.squeeze().numpy()


In [22]:
import numpy as np
X = np.array([get_distilbert_embedding(desc) for desc in movies['Description']])

In [23]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(X, y_single_label, test_size=0.2, random_state=42)

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.5425867507886435


In [24]:
model.save_pretrained("model/")
tokenizer.save_pretrained("model/")

('model/tokenizer_config.json',
 'model/special_tokens_map.json',
 'model/vocab.txt',
 'model/added_tokens.json')

In [26]:
model.save_pretrained("model/")
tokenizer.save_pretrained("model/")

import pickle
with open("label_encoder.pkl", "wb") as f:
    pickle.dump(label_encoder, f)


In [27]:
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification

# Load fine-tuned model and tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained("model/")
model = DistilBertForSequenceClassification.from_pretrained("model/")


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at model/ and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
import torch
import pickle

# Load tokenizer and model
tokenizer = DistilBertTokenizerFast.from_pretrained("model/")
model = DistilBertForSequenceClassification.from_pretrained("model/")
model.eval()  # Set model to evaluation mode

# Load label encoder
with open("label_encoder.pkl", "rb") as f:
    label_encoder = pickle.load(f)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at model/ and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
text = "I’m feeling really joyful and excited today!"

# Tokenize the input
inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)


In [30]:
# Get model predictions
with torch.no_grad():
    outputs = model(**inputs)
    predicted_class = torch.argmax(outputs.logits, dim=1).item()


In [31]:
# Decode the label
predicted_label = label_encoder.inverse_transform([predicted_class])[0]
print("Predicted Emotion:", predicted_label)

Predicted Emotion: anger


In [32]:
def predict_emotion(text, model, tokenizer, label_encoder):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
        predicted_class = torch.argmax(outputs.logits, dim=1).item()
    return label_encoder.inverse_transform([predicted_class])[0]


In [33]:
def recommend_movies_by_emotion(input_text, movies_df, model, tokenizer, label_encoder, top_k=5):
    # Predict emotion of the input text
    emotion = predict_emotion(input_text, model, tokenizer, label_encoder)

    # Filter movies with the same emotion
    filtered_movies = movies_df[movies_df['emotion'] == emotion]

    # Return top K titles
    return filtered_movies['movie_name'].head(top_k).tolist()


In [36]:
input_text = "I'm feeling so cheerful and energetic today!"
recommendations = recommend_movies_by_emotion(input_text, movies, model, tokenizer, label_encoder)
print("Top emotion-based recommendations:", recommendations)


Top emotion-based recommendations: ['10 Things I Hate About You', 'A Thin Line Between Love and Hate', 'About a Boy', 'Adam and Eve', 'Aenigma']


In [35]:
import numpy as np

def unified_recommendation(
    movie_title,
    content_sim,
    actor_sim,
    genre_sim,
    movies_df,
    tokenizer,
    model,
    label_encoder,
    alpha=0.25,
    beta=0.25,
    gamma=0.25,
    delta=0.25,
    top_n=5
):
    if movie_title not in content_sim.index:
        raise ValueError(f"{movie_title} not found in similarity matrices.")

    # Get indices of the movie
    idx = content_sim.index.get_loc(movie_title)

    # Extract individual similarity vectors
    content_scores = content_sim.iloc[idx].values
    actor_scores = actor_sim.iloc[idx].values
    genre_scores = genre_sim.iloc[idx].values

    # Emotion classification
    with torch.no_grad():
        inputs = tokenizer(movie_title, return_tensors="pt", truncation=True, padding=True)
        logits = model(**inputs).logits
        pred_class = torch.argmax(logits, dim=1).item()
        predicted_emotion = label_encoder.inverse_transform([pred_class])[0]

    # Build emotion similarity vector (binary match)
    emotion_vector = np.array([
        1.0 if emotion == predicted_emotion else 0.0
        for emotion in movies_df['emotion']
    ])

    # Normalize all scores
    def normalize(x):
        return (x - np.min(x)) / (np.max(x) - np.min(x)) if np.max(x) != np.min(x) else x

    content_scores = normalize(content_scores)
    actor_scores = normalize(actor_scores)
    genre_scores = normalize(genre_scores)
    emotion_vector = normalize(emotion_vector)

    #  Compute hybrid score
    hybrid_scores = (
        alpha * content_scores +
        beta * actor_scores +
        gamma * genre_scores +
        delta * emotion_vector
    )

    #Create result list excluding the input movie
    movie_indices = np.argsort(hybrid_scores)[::-1]
    top_indices = [i for i in movie_indices if content_sim.index[i] != movie_title][:top_n]
    top_movies = content_sim.index[top_indices].tolist()

    return top_movies


In [36]:
top_unified = unified_recommendation(
    "Snow White and the Seven Dwarfs",
    content_similarity_df,
    actor_similarity_df,
    genre_similarity_df,
    movies,
    tokenizer,
    model,
    label_encoder,
    alpha=0.25,
    beta=0.25,
    gamma=0.25,
    delta=0.25
)

print("Top recommendations from unified hybrid model:", top_unified)


Top recommendations from unified hybrid model: ['Shrek', 'Down to Earth', 'Nutty Professor II: The Klumps', 'Rio', 'My Super Ex-Girlfriend']


In [37]:
def recommend_by_emotion(emotion, movies_df, top_n=5):
    emotion = emotion.lower()
    matched_movies = movies_df[movies_df['emotion'].str.lower() == emotion]
    return matched_movies['movie_name'].head(top_n).tolist()


In [40]:
recommendations = recommend_by_emotion("sadness", movies)
print("Emotion-based recommendations:", recommendations)

Emotion-based recommendations: ['11/11/11', '11:11', '17 Again', '18 Again!', '1920: Evil Returns']


In [38]:
import pandas as pd
from ast import literal_eval

movies = pd.read_csv('/content/drive/MyDrive/Data/Final_Dataset.csv', converters={'genres': literal_eval})

In [39]:
def clean_nested_genre(genre_str):
    try:
        # Keep evaluating until it's a clean list
        while isinstance(genre_str, str):
            genre_str = literal_eval(genre_str)
        return genre_str
    except:
        return []  # Default if parsing fails

movies['genres'] = movies['genres'].apply(clean_nested_genre)

In [40]:
print(movies['genres'].head())

0    [Thriller, Science Fiction]
1              [Comedy, Romance]
2       [Comedy, Romance, Drama]
3       [Comedy, Drama, Romance]
4             [Horror, Thriller]
Name: genres, dtype: object


In [41]:
import requests
from IPython.display import Image, display
from urllib.parse import quote

OMDB_API_KEY = "60b53830"

def get_movie_poster(movie_title):
    try:
        # Encode movie title
        encoded_title = quote(movie_title)
        url = f"http://www.omdbapi.com/?t={encoded_title}&apikey={OMDB_API_KEY}"
        response = requests.get(url).json()

        if response.get("Poster", "N/A") != "N/A":
            return response["Poster"]
        else:
            return None
    except:
        return None

for movie in war_movies:
    poster_url = get_movie_poster(movie)
    if poster_url:
        print(f"Poster for '{movie}':")
        display(Image(url=poster_url, width=200))
    else:
        print(f"No poster found for '{movie}'")

NameError: name 'war_movies' is not defined

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from collections import defaultdict
from IPython.display import HTML, display, Image
import requests
from urllib.parse import quote
import ast
import pandas as pd

# --- Emotion Model Setup ---
model_name = "nateraw/bert-base-uncased-emotion"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
label_list = [label for _, label in sorted(model.config.id2label.items())]

# --- Data Preparation ---
movies['genres'] = movies['genres'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

# Build emotion-to-genre map
emotion_genre_map = defaultdict(set)
if 'emotion' in movies.columns:
    for _, row in movies.iterrows():
        for genre in row['genres']:
            emotion_genre_map[row['emotion']].add(genre)
    emotion_genre_map = {emotion: sorted(list(genres)) for emotion, genres in emotion_genre_map.items()}

# Build genre-to-movie map
genre_movies = defaultdict(list)
for _, row in movies.iterrows():
    for genre in row['genres']:
        genre_movies[genre.lower()].append(row['movie_name'])

# --- Helper Functions ---
def predict_emotion(text):
    model.eval()
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        predicted_class_id = torch.argmax(logits, dim=1).item()
        return label_list[predicted_class_id]

OMDB_API_KEY = "60b53830"

def get_movie_poster(movie_title):
    try:
        # Encode movie title for URL
        encoded_title = quote(movie_title)
        url = f"http://www.omdbapi.com/?t={encoded_title}&apikey={OMDB_API_KEY}"
        response = requests.get(url).json()

        if response.get("Poster", "N/A") != "N/A":
            return response["Poster"]
        else:
            return None
    except:
        return None

# --- Main Interaction Flow ---
user_input = "I'm feeling really down and stressed"
emotion = predict_emotion(user_input)
print(f"\nDetected emotion: {emotion}")

# Get genre suggestion
default_genres = emotion_genre_map.get(emotion, ["drama"])
print(f"\nSuggested genres for {emotion}: {', '.join(default_genres)}")
genre_input = input(f"\nWhich genre would you like to watch to cope with {emotion}? ").lower()

# Get recommendations
recommended = genre_movies.get(genre_input, [])[:5]  # Get top 5 movies

# --- Display Results ---
if recommended:
    print(f"\nRecommended {genre_input.capitalize()} movies:\n")
    for movie in recommended:
        poster_url = get_movie_poster(movie)
        tmdb_link = f"https://www.themoviedb.org/search?query={quote(movie)}"

        display(HTML(f"""
        <div style="margin:20px; float:left; text-align:center; width:220px; border:1px solid #ddd; padding:10px; border-radius:5px">
            <h4 style="margin:5px 0; height:50px; overflow:hidden">{movie}</h4>
            {f'<img src="{poster_url}" width="200" style="border-radius:3px">' if poster_url else '<div style="width:200px;height:300px;background:#eee;display:flex;align-items:center;justify-content:center">No poster</div>'}
            <p><a href="{tmdb_link}" target="_blank" style="color:#01b4e4;text-decoration:none">🔍 Where to Watch</a></p>
        </div>
        """))
    display(HTML("<div style='clear:both'></div>"))  # Clear float
else:
    print("\nNo recommendations found for this genre.")


tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/768 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]


Detected emotion: sadness

Suggested genres for sadness: Action, Adventure, Comedy, Crime, Drama, Foreign, Horror, Romance, Science Fiction, Thriller, War


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [47]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from collections import defaultdict
from IPython.display import HTML, display, Image, clear_output
import requests
from urllib.parse import quote
import ast
import pandas as pd
import ipywidgets as widgets
import random


# --- Emotion Model Setup ---
model_name = "nateraw/bert-base-uncased-emotion"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
label_list = [label for _, label in sorted(model.config.id2label.items())]

# --- Data Preparation ---
movies['genres'] = movies['genres'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

# Build emotion-to-genre map if emotion column exists
emotion_genre_map = defaultdict(set)
if 'emotion' in movies.columns:
    for _, row in movies.iterrows():
        for genre in row['genres']:
            emotion_genre_map[row['emotion']].add(genre)
    emotion_genre_map = {emotion: sorted(list(genres)) for emotion, genres in emotion_genre_map.items()}

# Build genre-to-movie map
genre_movies = defaultdict(list)
for _, row in movies.iterrows():
    for genre in row['genres']:
        genre_movies[genre.lower()].append(row['movie_name'])

# --- Helper Functions ---
def predict_emotion(text):
    model.eval()
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        predicted_class_id = torch.argmax(logits, dim=1).item()
        return label_list[predicted_class_id]

OMDB_API_KEY = "60b53830"

def get_movie_poster(movie_title):
    try:
        encoded_title = quote(movie_title)
        url = f"http://www.omdbapi.com/?t={encoded_title}&apikey={OMDB_API_KEY}"
        response = requests.get(url).json()
        return response.get("Poster") if response.get("Poster") != "N/A" else None
    except:
        return None

# --- Interactive Widgets ---
feeling_input = widgets.Textarea(
    value="",
    placeholder="How are you feeling? (e.g., 'I'm feeling really down and stressed')",
    description='Your Mood:',
    disabled=False,
    layout={'width': '500px', 'height': '60px'}
)

genre_dropdown = widgets.Dropdown(
    options=[],
    description='Genre:',
    disabled=True
)

recommend_button = widgets.Button(
    description="Get Recommendations",
    disabled=True,
    button_style='success'
)

output_area = widgets.Output()

def get_random_recommendations(genre, n=5):
    movies_in_genre = genre_movies.get(genre.lower(), [])
    if len(movies_in_genre) > n:
        return random.sample(movies_in_genre, n)
    else:
        return movies_in_genre

recommended = get_random_recommendations(genre_input, 5)

# --- UI Section ---
def on_feeling_change(change):
    if change['new']:
        emotion = predict_emotion(change['new'])
        genres = emotion_genre_map.get(emotion, ["drama"])
        with output_area:
            clear_output()
            print(f"\nDetected emotion: {emotion}")
            print(f"\nSuggested genres for {emotion}: {', '.join(genres)}")
        genre_dropdown.options = genres
        genre_dropdown.description = f'Choose genre:'
        genre_dropdown.disabled = False
    else:
        genre_dropdown.disabled = True

def on_recommend_click(b):
    with output_area:
        clear_output()
        print(f"\nDetected emotion: {predict_emotion(feeling_input.value)}")
        print(f"\nSuggested genres: {', '.join(genre_dropdown.options)}")
        print(f"\nYou selected: {genre_dropdown.value}")

        recommended = get_random_recommendations(genre_dropdown.value, 5)

        if recommended:
            print(f"\nRecommended {genre_dropdown.value.capitalize()} movies:\n")
            for movie in recommended:
                poster_url = get_movie_poster(movie)
                tmdb_link = f"https://www.themoviedb.org/search?query={quote(movie)}"

                display(HTML(f"""
                <div style="margin:20px; float:left; text-align:center; width:220px; border:1px solid #ddd; padding:10px; border-radius:5px">
                    <h4 style="margin:5px 0; height:50px; overflow:hidden">{movie}</h4>
                    {f'<img src="{poster_url}" width="200" style="border-radius:3px">' if poster_url else '<div style="width:200px;height:300px;background:#eee;display:flex;align-items:center;justify-content:center">No poster</div>'}
                    <p><a href="{tmdb_link}" target="_blank" style="color:#01b4e4;text-decoration:none">🔍 Where to Watch</a></p>
                </div>
                """))
            display(HTML("<div style='clear:both'></div>"))
        else:
            print("\nNo recommendations found for this genre.")

# [Rest of the widget setup code remains the same]

feeling_input.observe(on_feeling_change, names='value')
recommend_button.on_click(on_recommend_click)

# Enable recommend button when genre is selected
def on_genre_change(change):
    recommend_button.disabled = not bool(change['new'])
genre_dropdown.observe(on_genre_change, names='value')

# --- Display UI ---
display(widgets.VBox([
    widgets.HTML("<h2>Movie Recommendation System</h2>"),
    feeling_input,
    genre_dropdown,
    recommend_button,
    output_area
]))

VBox(children=(HTML(value='<h2>Movie Recommendation System</h2>'), Textarea(value='', description='Your Mood:'…

In [48]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from collections import defaultdict
from IPython.display import HTML, display, Image, clear_output
import requests
from urllib.parse import quote
import ast
import pandas as pd
import ipywidgets as widgets
import random
import numpy as np

# --- Emotion Model Setup ---
model_name = "nateraw/bert-base-uncased-emotion"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
label_list = [label for _, label in sorted(model.config.id2label.items())]

# --- Data Preparation ---
movies['genres'] = movies['genres'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

# Normalize ratings to 0-5 scale
if 'Ratings' in movies.columns:
    movies['rating_5'] = movies['Ratings'] / 2  # Convert 0-10 scale to 0-5
else:
    movies['rating_5'] = np.random.uniform(2.5, 5, len(movies))  # Mock ratings if none exist

# Build emotion-to-genre map
emotion_genre_map = defaultdict(set)
if 'emotion' in movies.columns:
    for _, row in movies.iterrows():
        for genre in row['genres']:
            emotion_genre_map[row['emotion']].add(genre)
    emotion_genre_map = {emotion: sorted(list(genres)) for emotion, genres in emotion_genre_map.items()}

# Build genre-to-movies map with ratings
genre_movies = defaultdict(list)
for _, row in movies.iterrows():
    for genre in row['genres']:
        genre_movies[genre.lower()].append({
            'title': row['movie_name'],
            'Ratings': row['rating_5'],
            'raw_rating': row.get('rating', None)
        })

# --- Helper Functions ---
def predict_emotion(text):
    model.eval()
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        predicted_class_id = torch.argmax(logits, dim=1).item()
        return label_list[predicted_class_id]

def get_movie_poster(movie_title):
    try:
        url = f"http://www.omdbapi.com/?t={quote(movie_title)}&apikey=60b53830"
        response = requests.get(url).json()
        return response.get("Poster") if response.get("Poster") != "N/A" else None
    except:
        return None

def get_star_rating(rating):
    """Convert numeric rating to star icons"""
    full_stars = int(rating)
    half_star = 1 if rating - full_stars >= 0.5 else 0
    empty_stars = 5 - full_stars - half_star
    stars = '★' * full_stars + '½' * half_star + '☆' * empty_stars
    return f"{stars} ({rating:.1f}/5)"

def get_recommendations(genre, n=5, sort_by_rating=False):
    """Get recommendations with sorting option"""
    movies_in_genre = genre_movies.get(genre.lower(), [])

    if sort_by_rating:
        movies_in_genre = sorted(movies_in_genre, key=lambda x: x['Ratings'], reverse=True)[:n]
    else:
        if len(movies_in_genre) > n:
            movies_in_genre = random.sample(movies_in_genre, n)

    return movies_in_genre[:n]

# --- Interactive Widgets ---
feeling_input = widgets.Textarea(
    placeholder="How are you feeling?",
    layout={'width': '500px', 'height': '60px'}
)

genre_dropdown = widgets.Dropdown(description='Genre:', disabled=True)
sort_toggle = widgets.ToggleButton(
    value=False,
    description='Sort by Rating',
    icon='star'
)
recommend_button = widgets.Button(description="Get Movies", button_style='success')
output_area = widgets.Output()

# --- UI Logic ---
def on_feeling_change(change):
    if change['new']:
        emotion = predict_emotion(change['new'])
        genres = emotion_genre_map.get(emotion, ["drama"])
        with output_area:
            clear_output()
            print(f"Detected emotion: {emotion}")
            print(f"Suggested genres: {', '.join(genres)}")
        genre_dropdown.options = genres
        genre_dropdown.disabled = False

def on_recommend_click(b):
    with output_area:
        clear_output()
        if not genre_dropdown.value:
            return

        recommended = get_recommendations(
            genre_dropdown.value,
            n=5,
            sort_by_rating=sort_toggle.value
        )

        print(f"\nRecommended {genre_dropdown.value.capitalize()} movies:")
        if sort_toggle.value:
            print("(Sorted by Rating)")
        else:
            print("(Random selection)")

        for movie in recommended:
            poster_url = get_movie_poster(movie['title'])
            tmdb_link = f"https://www.themoviedb.org/search?query={quote(movie['title'])}"

            display(HTML(f"""
            <div style="margin:20px; float:left; width:220px; border:1px solid #ddd; padding:10px; border-radius:5px; text-align:center">
                <h4 style="margin:5px 0; height:50px; overflow:hidden">{movie['title']}</h4>
                {f'<img src="{poster_url}" width="200" style="border-radius:3px">' if poster_url else '<div style="width:200px;height:300px;background:#eee;display:flex;align-items:center;justify-content:center">No poster</div>'}
                <p style="color:gold; font-size:18px; margin:5px 0">{get_star_rating(movie['Ratings'])}</p>
                <p><a href="{tmdb_link}" target="_blank" style="color:#01b4e4">🔍 Where to Watch</a></p>
            </div>
            """))
        display(HTML("<div style='clear:both'></div>"))

# Connect widgets
feeling_input.observe(on_feeling_change, names='value')
recommend_button.on_click(on_recommend_click)

def update_button_state(change):
    recommend_button.disabled = not genre_dropdown.value

genre_dropdown.observe(update_button_state, names='value')

# --- Display UI ---
display(widgets.VBox([
    widgets.HTML("<h2>🎬 Mood-Based Movie Recommender</h2>"),
    feeling_input,
    widgets.HBox([genre_dropdown, sort_toggle]),
    recommend_button,
    output_area
]))

VBox(children=(HTML(value='<h2>🎬 Mood-Based Movie Recommender</h2>'), Textarea(value='', layout=Layout(height=…