In [13]:
import tkinter as tk
from tkinter import ttk, StringVar, IntVar, messagebox
from tkinter import ttk, StringVar, IntVar, messagebox
from tkinter.scrolledtext import ScrolledText 
import pandas as pd
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

# Function to convert JSON fields into space-separated strings
def extract_names(json_str):
    return ' '.join([item['name'] for item in json.loads(json_str)])

# Load and preprocess data
file_path = 'C:/Users/14435/Downloads/tmdb_5000_movies_with_director_UPDATED3.csv'
movies_df = pd.read_csv(file_path)
movies_df['genres'] = movies_df['genres'].apply(extract_names)
movies_df['keywords'] = movies_df['keywords'].apply(extract_names)
movies_df['combined_features'] = movies_df['genres'] + ' ' + movies_df['keywords']

# Create TF-IDF vectors for combined features
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=50)
tfidf_matrix = tfidf_vectorizer.fit_transform(movies_df['combined_features'])

# Compute the cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Preprocessor for genres and keywords
preprocessor = ColumnTransformer(
    transformers=[
        ('combined_tfidf', TfidfVectorizer(stop_words='english', max_features=50), 'combined_features')
    ],
    remainder='drop'
)

# RandomForest and SVM pipelines with reduced complexity
rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=10, max_depth=10, random_state=42))
])
svm_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', SVC(random_state=42))
])

# Split the data into a training set and a test set
X = movies_df[['combined_features']]
y = movies_df['Director'].astype(str)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the pipelines to the training data
try:
    rf_pipeline.fit(X_train, y_train)
    svm_pipeline.fit(X_train, y_train)
except MemoryError:
    messagebox.showerror("MemoryError", "Model training could not be completed due to memory constraints.")

# Function to predict the director using Random Forest and SVM
def predict_director(genres_keywords):
    input_features = pd.DataFrame({'combined_features': [genres_keywords]})
    rf_predicted_director = rf_pipeline.predict(input_features)[0]
    svm_predicted_director = svm_pipeline.predict(input_features)[0]
    return rf_predicted_director, svm_predicted_director

# Function to get movie recommendations and predict directors
def get_feature_similarities(movie_title, num_recommendations=10):
    try:
        idx = movies_df[movies_df['original_title'] == movie_title].index[0]
    except IndexError:
        messagebox.showerror("Error", f"Movie '{movie_title}' not found in dataset.")
        return []

    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:num_recommendations+1]
    movie_indices = [i[0] for i in sim_scores]

    results = []
    input_movie_features = set(movies_df.iloc[idx]['combined_features'].split())

    for movie_idx in movie_indices:
        vote_count = movies_df.iloc[movie_idx]['vote_count']
        recommended_movie_features = set(movies_df.iloc[movie_idx]['combined_features'].split())
        common_features = input_movie_features.intersection(recommended_movie_features)
        actual_director = movies_df.iloc[movie_idx]['Director']
        
        # Predict the director for the recommended movie using both models
        rf_predicted_director, svm_predicted_director = predict_director(
            movies_df.iloc[movie_idx]['genres'] + ' ' + movies_df.iloc[movie_idx]['keywords']
        )
        
        results.append({
            'title': movies_df.iloc[movie_idx]['original_title'],
            'common_features': list(common_features),
            'vote_count': vote_count,
            'rf_predicted_director': rf_predicted_director,
            'svm_predicted_director': svm_predicted_director,
            'actual_director': actual_director
        })

    return results




# Function to predict directors for movies with missing director data
def predict_missing_directors(movies_df):
    missing_directors_df = movies_df[movies_df['Director'].isna()]
    missing_directors_df['combined_features'] = missing_directors_df['genres'] + ' ' + missing_directors_df['keywords']

    predictions_rf = []
    predictions_svm = []
    for _, row in missing_directors_df.iterrows():
        genres_keywords = row['combined_features']
        rf_pred, svm_pred = predict_director(genres_keywords)
        predictions_rf.append(rf_pred)
        predictions_svm.append(svm_pred)

    missing_directors_df['rf_predicted_director'] = predictions_rf
    missing_directors_df['svm_predicted_director'] = predictions_svm
    return missing_directors_df

def on_submit():
    movie_title = movie_title_var.get()
    num_recommendations = num_recommendations_var.get()
    
    if not movie_title:
        messagebox.showerror("Error", "Please enter a movie title.")
        return
    
    recommendations = get_feature_similarities(movie_title, num_recommendations)
    
    if not recommendations:
        messagebox.showinfo("Results", f"No recommendations found for '{movie_title}'.")
        return

    recommendations_str = '\n'.join([
        f"{rec['title']} (Common Features: {', '.join(rec['common_features'])}) "
        f"\nRandom Forest Predicted Director: {rec['rf_predicted_director']} "
        f"\nSVM Predicted Director: {rec['svm_predicted_director']} "
        f"\nActual Director: {rec['actual_director']}\n"
        for rec in recommendations
    ])
    
    # Calculate accuracy
    actual_directors = [rec['actual_director'] for rec in recommendations]
    rf_predicted_directors = [rec['rf_predicted_director'] for rec in recommendations]
    svm_predicted_directors = [rec['svm_predicted_director'] for rec in recommendations]
    
    rf_accuracy = (sum([1 for i in range(len(actual_directors)) if actual_directors[i] == rf_predicted_directors[i]]) / len(actual_directors)) * 100
    svm_accuracy = (sum([1 for i in range(len(actual_directors)) if actual_directors[i] == svm_predicted_directors[i]]) / len(actual_directors)) * 100
    
    accuracy_str = f"Random Forest Accuracy: {rf_accuracy:.2f}%\nSVM Accuracy: {svm_accuracy:.2f}%"
    
    # Display the results including accuracy in a new window
    results_window = tk.Toplevel(root)
    results_window.title(f"Results for '{movie_title}'")
    results_window.geometry('800x600')
    results_window.resizable(True, True)

    results_text = ScrolledText(results_window, wrap=tk.WORD)
    results_text.insert(tk.INSERT, recommendations_str + "\n\n" + accuracy_str)
    results_text.pack(expand=True, fill='both')
    results_window.grid_rowconfigure(0, weight=1)
    results_window.grid_columnconfigure(0, weight=1)


# GUI setup
root = tk.Tk()
root.title("Movie Recommendation System")

frame = ttk.Frame(root, padding="10")
frame.grid(row=0, column=0, sticky=(tk.W, tk.E, tk.N, tk.S))

ttk.Label(frame, text="Enter Movie Title:").grid(row=0, column=0, sticky=tk.W, pady=5)
movie_title_var = StringVar()
movie_title_entry = ttk.Entry(frame, textvariable=movie_title_var, width=40)
movie_title_entry.grid(row=0, column=1, pady=5)

ttk.Label(frame, text="Number of Recommendations:").grid(row=1, column=0, sticky=tk.W, pady=5)
num_recommendations_var = IntVar(value=10)
num_recommendations_spinbox = ttk.Spinbox(frame, from_=1, to=20, textvariable=num_recommendations_var, width=5)
num_recommendations_spinbox.grid(row=1, column=1, pady=5, sticky=tk.W)

submit_button = ttk.Button(frame, text="Get Recommendations", command=on_submit)
submit_button.grid(row=2, column=0, columnspan=2, pady=10)

root.mainloop()

In [2]:
import pandas as pd

# Load the datasets
path_with_directors = 'C:/Users/14435/Downloads/tmdb_5000_movies_with_director_UPDATED3.csv'
path_missing_directors = 'C:/Users/14435/Downloads/tmdb_5000_movies_with_director_UPDATED.csv'

df_with_directors = pd.read_csv(path_with_directors)
df_missing_directors = pd.read_csv(path_missing_directors)

# Merge the datasets on a common column (e.g., 'title' or 'id')
# Assuming the common column is 'title'
merged_df = df_missing_directors.merge(df_with_directors[['title', 'Director']], on='title', how='left')

# Rename columns for clarity
merged_df.rename(columns={'Director_x': 'predicted_director', 'Director_y': 'actual_director'}, inplace=True)

# Drop rows where actual director data is still missing (if any)
merged_df.dropna(subset=['actual_director'], inplace=True)

# Calculate the accuracy of predictions
correct_predictions = (merged_df['predicted_director'] == merged_df['actual_director']).sum()
total_predictions = len(merged_df)

# Calculate accuracy percentage
accuracy_percentage = (correct_predictions / total_predictions) * 100

print(f"Total Correct Predictions: {correct_predictions}")
print(f"Total Predictions Made: {total_predictions}")
print(f"Accuracy: {accuracy_percentage:.2f}%")


Total Correct Predictions: 4084
Total Predictions Made: 4839
Accuracy: 84.40%
