In [6]:
import pandas as pd
import statsmodels.formula.api as smf
from sklearn import linear_model

# load in the IMDB movie dataset
df = pd.read_csv('imdb_top_1000.csv')

# display the dataset info
# print(df.info())
df.columns

Index(['Poster_Link', 'Series_Title', 'Released_Year', 'Certificate',
       'Runtime', 'Genre', 'IMDB_Rating', 'Overview', 'Meta_score', 'Director',
       'Star1', 'Star2', 'Star3', 'Star4', 'No_of_Votes', 'Gross'],
      dtype='object')

In [10]:
df_mov = df.copy()

In [12]:
# dropping unwanted columns
df_mov.drop(columns=['Poster_Link','Overview','Director','Star1', 'Star2', 'Star3', 'Star4','Certificate',],inplace=True)



In [14]:
# Extract numeric values from string columns
df_mov['Runtime'] = df_mov['Runtime'].str.extract('(\d+)').astype(int)
df_mov['Gross'] = df_mov['Gross'].str.replace(',', '').astype(float)

  df_mov['Runtime'] = df_mov['Runtime'].str.extract('(\d+)').astype(int)


In [16]:
df_mov['genre_list'] = df_mov['Genre'].apply(lambda x: [g.strip() for g in x.split(',')])

In [18]:
#flatter all genres Just FYI 
from itertools import chain

all_genres = list(chain.from_iterable(df_mov['genre_list']))
unique_genres = list(set(all_genres))
print(unique_genres)

['Action', 'Adventure', 'Mystery', 'War', 'Thriller', 'Western', 'Drama', 'Sci-Fi', 'Horror', 'Musical', 'Romance', 'Film-Noir', 'Music', 'Crime', 'Sport', 'Animation', 'History', 'Comedy', 'Family', 'Fantasy', 'Biography']


In [22]:
genre_data = {}
for genre in unique_genres:
    genre_data[f'is_{genre.lower()}'] = df_mov['Genre'].str.contains(genre).astype(int)

# Create a DataFrame from the dictionary and join it to the original DataFrame
genre_df = pd.DataFrame(genre_data, index=df_mov.index)
df_mov = pd.concat([df_mov, genre_df], axis=1)

In [24]:
#convert non numeric to numeric 
df_mov['Released_Year'] = pd.to_numeric(df_mov['Released_Year'], errors='coerce')
df_mov = df_mov.dropna(subset=['Released_Year'])  # if you want to drop NaNs too
df_mov['Released_Year'] = df_mov['Released_Year'].astype(int)

In [28]:
# Handle missing values in relevant columns
df_mov['Runtime'] = pd.to_numeric(df_mov['Runtime'], errors='coerce')
df_mov['Meta_score'] = pd.to_numeric(df_mov['Meta_score'], errors='coerce')


In [56]:
# Select features for the model
# Base features without genres
base_features = ['Released_Year', 'Runtime','No_of_Votes']
# Add Meta_score if it exists
if 'Meta_score' in df_mov.columns:
    base_features.append('Meta_score')

# Genre features
genre_features = [f'is_{genre.lower()}' for genre in unique_genres]
# Combine all features
X = df_mov[base_features + genre_features].copy()  # Use copy to avoid potential issues
y = df_mov['IMDB_Rating'].copy()

# Fill NaN values with appropriate replacements ,as i did df_mov.info() there were no 
#all were non-null columns , there were some hidden nan value, hence i used this
X = X.fillna(X.mean())  # For numerical features
# Or drop rows with any NaN values
X = X.dropna()


In [44]:
from sklearn.model_selection import train_test_split
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [54]:
from sklearn.linear_model import LinearRegression
# Create and train the linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

In [58]:
# Make predictions
y_pred = model.predict(X_test)

In [60]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
# Evaluate the model


mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse:.4f}")
print(f"R-squared: {r2:.4f}")

# Print coefficients to understand feature importance
print("\nModel Coefficients:")
for i, feature in enumerate(X.columns):
    print(f"{feature}: {model.coef_[i]:.4f}")

Mean Squared Error: 0.0451
R-squared: 0.3115

Model Coefficients:
Released_Year: -0.0032
Runtime: 0.0020
No_of_Votes: 0.0000
Meta_score: 0.0045
is_action: -0.0101
is_adventure: -0.0949
is_mystery: -0.0003
is_war: 0.0235
is_thriller: -0.0508
is_western: -0.0341
is_drama: 0.0579
is_sci-fi: -0.0616
is_horror: -0.0467
is_musical: -0.1630
is_romance: -0.0625
is_film-noir: -0.0305
is_music: 0.0923
is_crime: -0.0000
is_sport: 0.0038
is_animation: 0.1141
is_history: -0.0548
is_comedy: 0.0094
is_family: 0.0034
is_fantasy: -0.0460
is_biography: -0.0255


In [270]:
def predict_movie_rating(year, runtime, metascore, genres,no_of_votes):
    """
    Predict IMDB rating based on movie features
    
    Parameters:
    year (int): Release year
    runtime (int): Runtime in minutes
    metascore (int): Metascore rating (0-100)
    genres (list): List of genres for the movie
    No of votes : Votes for the movie
    
    Returns:
    float: Predicted IMDB rating
    """
    # Create a feature array with the exact size your model expects (24)
    features = np.zeros((1, 25))
    
    # Set base features - make sure these indices match training
    features[0, 0] = year
    features[0, 1] = runtime
    features[0, 2] = metascore
    features[0,3] = np.log1p(no_of_votes)
    
    # Set genre flags - ensure you're using the exact same genre order
    # as was used during training
    genre_start_index = 4  # Adjust based on your model structure
    for genre in genres:
        if genre in all_genres:
            genre_index = all_genres.index(genre)
            # Make sure this index doesn't exceed your feature count
            if genre_start_index + genre_index < 25:
                features[0, genre_start_index + genre_index] = 1
                 # Predict
    raw_prediction = model.predict(features)[0]

    # Clip to IMDb rating range
    clipped_prediction = np.clip(raw_prediction, 0, 10)

    return clipped_prediction
    
   



In [62]:
print(model.coef_)

[-3.22532614e-03  1.96752987e-03  4.95282832e-07  4.46693337e-03
 -1.00645101e-02 -9.48985972e-02 -2.87375039e-04  2.35103255e-02
 -5.08024422e-02 -3.41337088e-02  5.79096212e-02 -6.15696311e-02
 -4.66995389e-02 -1.62970719e-01 -6.24872320e-02 -3.04811327e-02
  9.23453623e-02 -1.47298844e-05  3.78383719e-03  1.14121580e-01
 -5.47790024e-02  9.37077850e-03  3.37629981e-03 -4.59875589e-02
 -2.55194542e-02]


In [274]:

# Example usage
new_movie = {
    'year': 2025,
    'runtime': 97,
    'metascore': 70,
    'genres': ['Adventure', 'Fantasy','Family','Romance','Musical'],
    'no_of_votes':2000
}

predicted_rating = predict_movie_rating(
    new_movie['year'], 
    new_movie['runtime'], 
    new_movie['metascore'], 
    new_movie['genres'],
    new_movie['no_of_votes']
)

print(f"\nPredicted IMDB Rating for the new movie: {predicted_rating:.2f}")




Predicted IMDB Rating for the new movie: 7.31




In [None]:
#for companion movie - i am getting 7.82 
# new_movie = {
#     'year': 2025,
#     'runtime': 97,
#     'metascore': 70,
#     'genres': ['Thriller', 'Sci-Fi','Horror','Romance']
# }

# snowhite = {
#     'year': 2025,
#     'runtime': 109,
#     'metascore': 50,
#     'genres': ['Adventure', 'Fantasy','Family','Romance','Musical'],
#     'no_of_votes':2000
# }


