In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive so the Drive-based paths used by later
# cells resolve. force_remount=True asks Colab to remount even when the drive
# is already mounted.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Location of the movies CSV on the mounted Google Drive. Read by the loading
# cell below and, as a notebook-level global, by train_and_save_models().
file_path = '/content/drive/MyDrive/ml project/movies_dataset_filled.csv'

In [None]:
import pandas as pd
# Load the whole CSV into a DataFrame for the exploratory cells that follow.
# train_and_save_models() re-reads the file itself and does not use this `df`.
df = pd.read_csv(file_path)

In [None]:
# Show the column names of the loaded dataset.
df.columns

Index(['title', 'original_language', 'original_title', 'overview', 'genres',
       'production_companies', 'production_countries', 'imdb_rating',
       'director'],
      dtype='object')

In [None]:
# Show (row count, column count) of the loaded dataset.
df.shape

(1039268, 9)

In [None]:
# Preview the first 10 rows.
df.head(10)

Unnamed: 0,title,original_language,original_title,overview,genres,production_companies,production_countries,imdb_rating,director
0,Ariel,fi,Ariel,A Finnish man goes to the city to find a job a...,"Comedy, Drama, Romance, Crime",Villealfa Filmproductions,Finland,7.4,Aki Kaurism√§ki
1,Shadows in Paradise,fi,Varjoja paratiisissa,"Nikander, a rubbish collector and would-be ent...","Comedy, Drama, Romance",Villealfa Filmproductions,Finland,7.4,Aki Kaurism√§ki
2,Four Rooms,en,Four Rooms,It's Ted the Bellhop's first night on the job....,Comedy,"Miramax, A Band Apart",United States of America,6.7,"Allison Anders, Robert Rodriguez, Alexandre Ro..."
3,Judgment Night,en,Judgment Night,"Four young friends, while taking a shortcut en...","Action, Crime, Thriller","Largo Entertainment, JVC, Universal Pictures",United States of America,6.6,Stephen Hopkins
4,Life in Loops (A Megacities RMX),en,Life in Loops (A Megacities RMX),Timo Novotny labels his new project an experim...,Documentary,inLoops,Austria,8.1,Timo Novotny
5,Sunday in August,de,Sonntag im August,This is a Drama type movie.,Drama,Unknown,Germany,6.8,"Anna Haas, Marc Meyer"
6,Star Wars,en,Star Wars,Princess Leia is captured and held hostage by ...,"Adventure, Action, Science Fiction","Lucasfilm Ltd., 20th Century Fox",United States of America,8.6,George Lucas
7,Finding Nemo,en,Finding Nemo,"Nemo, an adventurous young clownfish, is unexp...","Animation, Family",Pixar,United States of America,8.2,Andrew Stanton
8,Forrest Gump,en,Forrest Gump,A man with a low IQ has accomplished great thi...,"Comedy, Drama, Romance","Paramount Pictures, The Steve Tisch Company, W...",United States of America,8.8,Robert Zemeckis
9,American Beauty,en,American Beauty,"Lester Burnham, a depressed suburban father in...",Drama,"DreamWorks Pictures, Jinks/Cohen Company",United States of America,8.3,Sam Mendes


In [None]:
# ml_model/train_model.py
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import MiniBatchKMeans
# from sklearn.metrics.pairwise import cosine_similarity
import joblib
import os
import time
from pathlib import Path

In [None]:
# Create a relative ml_model2/artifacts directory (no error if it already exists).
# NOTE(review): this path is relative to the current working directory, whereas
# train_and_save_models() creates and writes to an absolute Google Drive path —
# confirm whether this local directory is still needed.
os.makedirs('ml_model2/artifacts', exist_ok=True)

In [None]:
# Count missing values per column.
df.isnull().sum()

Unnamed: 0,0
title,0
original_language,0
original_title,0
overview,0
genres,0
production_companies,0
production_countries,0
imdb_rating,0
director,0


In [None]:
def train_and_save_models(data_path=None, artifacts_dir=None):
    """
    Load the movie dataset, fit the TF-IDF -> SVD -> MiniBatchKMeans pipeline,
    and save the fitted models plus a reduced movie table with joblib.

    Run this in Google Colab for better performance with 1M rows.

    Args:
        data_path: CSV file to train on. When None, the notebook-level
            ``file_path`` variable is used.
        artifacts_dir: Directory that receives the artifacts (str or Path).
            When None, the ``ml_model2/artifacts`` folder on Google Drive
            is used.

    Returns:
        bool: True when every artifact was written; False when any stage
        failed. Failures are printed rather than raised.
    """
    print("üöÄ Starting model training process...")
    start_time = time.time()

    if data_path is None:
        # Default to the dataset path defined in an earlier notebook cell.
        data_path = file_path
    if artifacts_dir is None:
        artifacts_dir = '/content/drive/MyDrive/ml project/movie_recommender_backend/ml_model2/artifacts'
    artifacts_dir = Path(artifacts_dir)

    # Create artifacts directory if it doesn't exist
    artifacts_dir.mkdir(parents=True, exist_ok=True)
    print(f"üìÅ Artifacts will be saved to: {artifacts_dir}")

    # Columns written to movies_preprocessed.pkl; 'cluster' is produced by the
    # clustering step, every other column must already be in the CSV.
    essential_columns = ['title', 'cluster', 'imdb_rating', 'overview', 'genres', 'director']

    # 2. Load your dataset from your specific path
    print("üì¶ Loading dataset...")
    try:
        df = pd.read_csv(data_path)
        print(f"‚úÖ Dataset loaded successfully! Shape: {df.shape}")
        print(f"   Columns available: {list(df.columns)}")
    except FileNotFoundError:
        print(f"‚ùå Dataset file not found at: {data_path}")
        print("   Please check the file path in Google Drive")
        return False
    except Exception as e:
        print(f"‚ùå Error loading dataset: {e}")
        return False

    # 3. Validate the input before any expensive work
    print("üîß Preprocessing data...")

    # Fail fast: checking only after clustering would waste the whole training
    # run, and a missing text column would raise an uncaught KeyError below.
    for col in essential_columns:
        if col != 'cluster' and col not in df.columns:
            print(f"‚ùå Column '{col}' not found in dataset!")
            return False

    # Check if we have enough data
    if df.empty:
        print("‚ùå No data available after preprocessing!")
        return False

    # 4. Combine text features
    print("üîÑ Combining text features...")
    # NaN entries would make the concatenation NaN, which TfidfVectorizer
    # rejects; treat missing text as empty. The stored columns are untouched.
    overview_text = df['overview'].fillna('').astype(str)
    genres_text = df['genres'].fillna('').astype(str)
    df['combined_text'] = overview_text + ' ' + genres_text

    # 5. Train TF-IDF Vectorizer
    print("üìä Training TF-IDF Vectorizer...")
    try:
        tfidf = TfidfVectorizer(
            max_features=50000,      # Reduce dimensionality
            stop_words='english',    # Remove common words
            ngram_range=(1, 2),      # Use single words and two-word phrases
            min_df=2,                # Ignore terms that appear in less than 2 documents
            max_df=0.85              # Ignore terms that appear in more than 85% of documents
        )
        tfidf_matrix = tfidf.fit_transform(df['combined_text'])
        print(f"‚úÖ TF-IDF training completed! Matrix shape: {tfidf_matrix.shape}")
    except Exception as e:
        print(f"‚ùå Error in TF-IDF training: {e}")
        return False

    # 6. Apply Dimensionality Reduction with SVD
    print("üéØ Applying SVD for dimensionality reduction...")
    try:
        svd = TruncatedSVD(
            n_components=300,    # Reduce to 300 dimensions
            random_state=42,     # For reproducible results
            n_iter=10            # Number of iterations
        )
        svd_matrix = svd.fit_transform(tfidf_matrix)
        print(f"‚úÖ SVD completed! Explained variance ratio: {svd.explained_variance_ratio_.sum():.4f}")
        print(f"   Reduced matrix shape: {svd_matrix.shape}")
    except Exception as e:
        print(f"‚ùå Error in SVD: {e}")
        return False

    # 7. Cluster movies using MiniBatchKMeans
    print("üì¶ Clustering movies with MiniBatchKMeans...")
    try:
        kmeans = MiniBatchKMeans(
            n_clusters=500,        # Number of clusters
            random_state=42,       # For reproducible results
            batch_size=1000,       # Size of mini-batches
            n_init=3,              # Number of random initializations
            max_iter=100,          # Maximum number of iterations
            verbose=1              # Show progress
        )
        df['cluster'] = kmeans.fit_predict(svd_matrix)
        print("‚úÖ Clustering completed!")
        print(f"   Number of clusters: {kmeans.n_clusters}")
    except Exception as e:
        print(f"‚ùå Error in clustering: {e}")
        return False

    # 8. Prepare final dataset for saving (keep only essential columns)
    print("üíæ Preparing data for saving...")
    movies_preprocessed = df[essential_columns].copy()

    # 9. Save ALL artifacts using joblib to your specific Google Drive path
    print("üíæ Saving models and processed data...")
    flag_path = artifacts_dir / 'models_trained.flag'
    try:
        # Drop any flag left by a previous run first, so a failure part-way
        # through this save cannot leave a flag that claims success.
        flag_path.unlink(missing_ok=True)

        # Save the trained models
        joblib.dump(tfidf, artifacts_dir / 'tfidf_vectorizer.pkl', compress=3)
        joblib.dump(svd, artifacts_dir / 'svd_model.pkl', compress=3)
        joblib.dump(kmeans, artifacts_dir / 'kmeans_model.pkl', compress=3)

        # Save the preprocessed movie data
        joblib.dump(movies_preprocessed, artifacts_dir / 'movies_preprocessed.pkl', compress=3)

        # Create a flag file to indicate training is complete
        with open(flag_path, 'w') as f:
            f.write('Training completed successfully')

        print("‚úÖ All models and data saved successfully!")
        print(f"üìÅ Files saved to: {artifacts_dir}")
        print("   - tfidf_vectorizer.pkl")
        print("   - svd_model.pkl")
        print("   - kmeans_model.pkl")
        print("   - movies_preprocessed.pkl")
        print("   - models_trained.flag")

        # Show some statistics
        clusters_used = movies_preprocessed['cluster'].nunique()
        print("\nüìä Dataset Statistics:")
        print(f"   Total movies processed: {len(movies_preprocessed)}")
        print(f"   Number of clusters: {kmeans.n_clusters}")
        print(f"   Unique clusters used: {clusters_used}")
        print(f"   Average movies per cluster: {len(movies_preprocessed) / clusters_used:.1f}")

        end_time = time.time()
        print(f"\n‚è∞ Total execution time: {(end_time - start_time) / 60:.2f} minutes")

    except Exception as e:
        print(f"‚ùå Error saving files: {e}")
        return False

    return True

In [None]:

def check_artifacts(artifacts_dir=None):
    """Check whether every required artifact file exists.

    Args:
        artifacts_dir: Directory to inspect (str or Path). When None, the
            ``ml_model2/artifacts`` folder on Google Drive is checked.

    Returns:
        bool: True when all required files are present, False otherwise.
        The status of each file is printed.
    """
    if artifacts_dir is None:
        artifacts_dir = '/content/drive/MyDrive/ml project/movie_recommender_backend/ml_model2/artifacts'
    artifacts_dir = Path(artifacts_dir)

    required_files = [
        'tfidf_vectorizer.pkl',
        'svd_model.pkl',
        'kmeans_model.pkl',
        'movies_preprocessed.pkl',
        'models_trained.flag'
    ]

    print("üîç Checking for required artifact files...")
    print(f"üìÅ Checking path: {artifacts_dir}")
    missing_files = []

    # Loop names deliberately differ from the notebook-level `file_path`
    # (the dataset CSV) so the two are not confused.
    for artifact_name in required_files:
        if (artifacts_dir / artifact_name).exists():
            print(f"   ‚úÖ {artifact_name}")
        else:
            print(f"   ‚ùå {artifact_name} (missing)")
            missing_files.append(artifact_name)

    if missing_files:
        print(f"\n‚ùå Missing {len(missing_files)} files. Please run training first.")
        return False

    print("\n‚úÖ All required files are present!")
    return True

if __name__ == "__main__":
    # First, mount Google Drive in Colab
    print("üìÇ Mounting Google Drive...")
    from google.colab import drive
    drive.mount('/content/drive')
    print("‚úÖ Google Drive mounted successfully!")

    # Train by default; skip only when artifacts already exist and the user
    # declines a retrain. A flag is used instead of exit() because calling
    # exit() inside a notebook cell asks the kernel itself to shut down.
    should_train = True
    if check_artifacts():
        print("Models are already trained. Would you like to retrain? (y/n)")
        # strip() so an answer typed with surrounding spaces is still recognised
        response = input().strip().lower()
        if response != 'y':
            print("Exiting without retraining.")
            should_train = False

    if should_train:
        # Run the training process
        success = train_and_save_models()

        if success:
            print("\nüéâ Training completed successfully! You can now:")
            print("   1. Download the .pkl files from Google Drive")
            print("   2. Place them in your local movie_recommender_backend/ml_model/artifacts/ folder")
            print("   3. Use test_recommendations.py to test the system")
            print("   4. Run app.py to start the web server")
        else:
            print("\nüí• Training failed! Please check the error messages above.")

üìÇ Mounting Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
‚úÖ Google Drive mounted successfully!
üîç Checking for required artifact files...
üìÅ Checking path: /content/drive/MyDrive/ml project/movie_recommender_backend/ml_model2/artifacts
   ‚ùå tfidf_vectorizer.pkl (missing)
   ‚ùå svd_model.pkl (missing)
   ‚ùå kmeans_model.pkl (missing)
   ‚ùå movies_preprocessed.pkl (missing)
   ‚ùå models_trained.flag (missing)

‚ùå Missing 5 files. Please run training first.
üöÄ Starting model training process...
üìÅ Artifacts will be saved to: /content/drive/MyDrive/ml project/movie_recommender_backend/ml_model2/artifacts
üì¶ Loading dataset...
‚úÖ Dataset loaded successfully! Shape: (1039268, 9)
   Columns available: ['title', 'original_language', 'original_title', 'overview', 'genres', 'production_companies', 'production_countries', 'imdb_rating', 'director']
üîß Preprocessing data...
ü

In [None]:
# Report the interpreter and library versions of this Colab session.
# NOTE(review): presumably recorded so the local environment that loads the
# .pkl artifacts can be matched to these versions — confirm.
import sys
import sklearn
import pandas as pd
import numpy as np

print(f"Python: {sys.version}")
print(f"Scikit-learn: {sklearn.__version__}")
print(f"Pandas: {pd.__version__}")
print(f"Numpy: {np.__version__}")

Python: 3.12.11 (main, Jun  4 2025, 08:56:18) [GCC 11.4.0]
Scikit-learn: 1.6.1
Pandas: 2.2.2
Numpy: 2.0.2


In [None]:
import os

# Artifact files whose on-disk size is reported below.
files = ['tfidf_vectorizer.pkl', 'svd_model.pkl', 'kmeans_model.pkl', 'movies_preprocessed.pkl']
# NOTE(review): this inspects ml_model/artifacts, while train_and_save_models()
# writes to ml_model2/artifacts — confirm which folder is intended.
artifacts_path = "/content/drive/MyDrive/ml project/movie_recommender_backend/ml_model/artifacts/"

print("File sizes in Colab:")
for file in files:
    full_path = os.path.join(artifacts_path, file)
    # Report absent files first, then fall through to the size lookup.
    if not os.path.exists(full_path):
        print(f"{file}: MISSING")
        continue
    size = os.path.getsize(full_path)
    print(f"{file}: {size} bytes")

File sizes in Colab:
tfidf_vectorizer.pkl: 673086 bytes
svd_model.pkl: 115999872 bytes
kmeans_model.pkl: 2835425 bytes
movies_preprocessed.pkl: 141584151 bytes


In [None]:
# In Colab, compress the artifacts folder first.
# The `cd` and `zip` are chained with && so zip only runs inside the target
# folder; artifacts.zip is created next to the artifacts/ directory.
# NOTE(review): this zips ml_model/artifacts, while train_and_save_models()
# writes to ml_model2/artifacts — confirm which folder is intended.
!cd "/content/drive/MyDrive/ml project/movie_recommender_backend/ml_model/" && zip -r artifacts.zip artifacts/

# Then download the zip file from Google Drive
# Extract locally instead of individual file downloads

  adding: artifacts/ (stored 0%)
  adding: artifacts/tfidf_vectorizer.pkl (deflated 0%)
  adding: artifacts/svd_model.pkl (deflated 0%)
  adding: artifacts/kmeans_model.pkl (deflated 0%)
  adding: artifacts/movies_preprocessed.pkl (deflated 0%)
  adding: artifacts/models_trained.flag (stored 0%)
