# Group Lab 3 Code


### Imports

In [53]:
import pandas as pd
import statsmodels.formula.api as smf
from sklearn import linear_model

# load in the IMDB movie dataset
df = pd.read_csv('imdb_top_1000.csv')

# display the dataset info
# print(df.info())
df.columns

Index(['Poster_Link', 'Series_Title', 'Released_Year', 'Certificate',
       'Runtime', 'Genre', 'IMDB_Rating', 'Overview', 'Meta_score', 'Director',
       'Star1', 'Star2', 'Star3', 'Star4', 'No_of_Votes', 'Gross'],
      dtype='object')

### Preprocessing

In [57]:
# create a copy of the dataframe
df1 = df.copy()

# drop any irrelevant columns
df1.drop(columns=['Poster_Link', 'Overview', 'No_of_Votes', 'Director', 'Star1', 'Star2', 'Star3', 'Star4'], inplace=True)

# convert the release year to numeric
df1['Release_Year'] = pd.to_numeric(df1['Released_Year'], errors='coerce') # turns any bad values to NaN
df1.drop(columns=['Released_Year'], inplace=True)

# convert gross to numeric
df1['Gross'] = df1['Gross'].str.replace(',', '')
df1['Gross'] = pd.to_numeric(df1['Gross'], errors='coerce')

# check and drop the rows that weren't able to be converted to numeric
df1 = df1[df1['Release_Year'].notna()]
df1 = df1[df1['Gross'].notna()]

# convert runtime to numeric
df1['Runtime'] = df1['Runtime'].str.replace(' min', '', regex=False)
df1['Runtime'] = df1['Runtime'].fillna(0).astype(float)

# handle missing values in the certificate column by filling with a placeholder string
df1['Certificate'] = df1['Certificate'].fillna('Unrated')

# remove any whitespaces in the certificate columns
df1['Certificate'] = df1['Certificate'].str.strip()

# ensure all certificates fall under the same naming conventions
df1['Certificate'] = df1['Certificate'].str.replace('/', '', regex=False)
df1['Certificate'] = df1['Certificate'].str.replace('TV-PG', 'PG', regex=False)
df1['Certificate'] = df1['Certificate'].str.replace('GP', 'PG', regex=False)

# Handle missing values in meta_Score by filling with the median
df1['Meta_score'] = pd.to_numeric(df1['Meta_score'], errors='coerce')  # Ensure it's numeric
df1['Meta_score'] = df1['Meta_score'].fillna(df1['Meta_score'].median())

# normalize the meta score (out of 100) to a scale of 0 to 1
df1['Meta_score'] = df1['Meta_score'] / 100

# normalize the IMDB score (out of 10) to a scale of 0 to 1
df1['IMDB_Rating'] = df1['IMDB_Rating'] / 10

# combine the two scores (simple average in this case)
df1['Combined_Rating'] = ((df1['Meta_score'] + df1['IMDB_Rating']) / 2) * 10

# drop the two original rating columns
df1.drop(columns=['Meta_score', 'IMDB_Rating'], inplace=True)

# check that all cells contain values
print(df1.info())

<class 'pandas.core.frame.DataFrame'>
Index: 830 entries, 0 to 997
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Series_Title     830 non-null    object 
 1   Certificate      830 non-null    object 
 2   Runtime          830 non-null    float64
 3   Genre            830 non-null    object 
 4   Gross            830 non-null    float64
 5   Release_Year     830 non-null    float64
 6   Combined_Rating  830 non-null    float64
dtypes: float64(4), object(3)
memory usage: 51.9+ KB
None


### Preprocessing Continued
#### Separating the genre column into separate rows

In [15]:
# # split the genres column by comma
# df_genres = df1['Genre'].str.split(',', expand=True)

# # count the number of genres for each movie (i.e., the number of splits)
# genre_counts = df_genres.notna().sum(axis=1)

# # repeat the series title for each movie according to the genre count
# df_titles_repeated = df1[['Series_Title']].loc[df1.index].copy()
# df_titles_repeated = df_titles_repeated.loc[df_titles_repeated.index.repeat(genre_counts)].reset_index(drop=True)

# # flatten the dataFrame by stacking the genre columns into a single column
# df_genres_flat = df_genres.stack().reset_index(drop=True)

# # merge the repeated titles back into the genres DataFrame
# df_genres_flat = pd.DataFrame(df_genres_flat, columns=['Genre'])
# df_genres_flat['Series_Title'] = df_titles_repeated['Series_Title']

# # strip any whitespaces from the genres
# df_genres_flat['Genre'] = df_genres_flat['Genre'].str.strip()

# print(df_genres_flat.head())

    Genre              Series_Title
0   Drama  The Shawshank Redemption
1   Crime             The Godfather
2   Drama             The Godfather
3  Action           The Dark Knight
4   Crime           The Dark Knight


### One-Hot Encoding

In [17]:
# # one-hot encode the genres
# df_genres_encoded = pd.get_dummies(df_genres_flat['Genre'], prefix='Genre', drop_first=True)

# # merge the one-hot encoded genres back into df_genres
# df_genres_flat = pd.concat([df_genres_flat, df_genres_encoded], axis=1)

# # group by Series_Title and aggregate by taking the max (since 0/1 values)
# df_genres_grouped = df_genres_flat.groupby('Series_Title', as_index=False).max()

# # merge df1 with df_genres_grouped based on Series_Title
# df1 = pd.merge(df1, df_genres_grouped, on='Series_Title', how='left')

# # one-hot encode the certificate column
# df_certificate_encoded = pd.get_dummies(df1['Certificate'], prefix='Certificate', drop_first=True)
# #print(df_certificate_encoded.columns)

# # merge the one-hot encoded certificate back into df1
# df1 = pd.concat([df1, df_certificate_encoded], axis=1)

# # drop the original original genre and cerificate columns
# df1.drop(columns=['Genre_x', 'Genre_y', 'Certificate'], inplace=True)

# print(df1.head)

<bound method NDFrame.head of                  Series_Title  Runtime        Gross  Release_Year  \
0    The Shawshank Redemption    142.0   28341469.0        1994.0   
1               The Godfather    175.0  134966411.0        1972.0   
2             The Dark Knight    152.0  534858444.0        2008.0   
3      The Godfather: Part II    202.0   57300000.0        1974.0   
4                12 Angry Men     96.0    4360000.0        1957.0   
..                        ...      ...          ...           ...   
825              Giù la testa    157.0     696690.0        1971.0   
826            Kelly's Heroes    144.0    1378435.0        1970.0   
827           The Jungle Book     78.0  141843612.0        1967.0   
828        A Hard Day's Night     87.0   13780024.0        1964.0   
829     From Here to Eternity    118.0   30500000.0        1953.0   

     Combined_Rating  Genre_Adventure  Genre_Animation  Genre_Biography  \
0               8.65            False            False            

### Linear Regression

In [47]:
# print(df1.info())


               Series_Title  Runtime        Gross  Release_Year  \
0  The Shawshank Redemption    142.0   28341469.0        1994.0   
1             The Godfather    175.0  134966411.0        1972.0   
2           The Dark Knight    152.0  534858444.0        2008.0   
3    The Godfather: Part II    202.0   57300000.0        1974.0   

   Combined_Rating  Genre_Adventure  Genre_Animation  Genre_Biography  \
0             8.65            False            False            False   
1             9.60            False            False            False   
2             8.70            False            False            False   
3             9.00            False            False            False   

   Genre_Comedy  Genre_Crime  ...  Genre_Western  Certificate_Approved  \
0         False        False  ...          False                 False   
1         False         True  ...          False                 False   
2         False         True  ...          False                 False   
3 

In [20]:
# # drop series title column as it is non-numeric
# df_rating = df1.drop(columns=['Series_Title'])

# # rename columns with special characters
# df_rating.columns = df_rating.columns.str.replace('-', '_')

# # drop rows with missing values
# df_rating = df_rating.dropna()

# # list of columns that are continuous (i.e., they should not be converted to bool)
# continuous_columns = ['Runtime', 'Gross', 'Release_Year', 'Combined_Rating']

# # identify columns that are one-hot encoded (i.e., the categorical variables)
# one_hot_columns = df_rating.columns.difference(continuous_columns)

# # convert one-hot encoded columns (categorical) to integers (0 or 1)
# df_rating[one_hot_columns] = df_rating[one_hot_columns].apply(lambda col: col.map({True: 1, False: 0}).fillna(col))

# # ensure combined rating is numeric
# df_rating['Combined_Rating'] = pd.to_numeric(df_rating['Combined_Rating'], errors='coerce')

# # define the formula for the model
# formula = 'Combined_Rating ~ ' + ' + '.join(df_rating.columns.difference(['Combined_Rating']))

# # fit the model
# model_rating = smf.ols(formula, data=df_rating).fit()

# # view the summary of the model
# print(model_rating.summary())

                            OLS Regression Results                            
Dep. Variable:        Combined_Rating   R-squared:                       0.153
Model:                            OLS   Adj. R-squared:                  0.119
Method:                 Least Squares   F-statistic:                     4.485
Date:                Tue, 08 Apr 2025   Prob (F-statistic):           1.64e-14
Time:                        23:42:39   Log-Likelihood:                -747.60
No. Observations:                 830   AIC:                             1561.
Df Residuals:                     797   BIC:                             1717.
Df Model:                          32                                         
Covariance Type:            nonrobust                                         
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
Intercept               27.9507 

### Predict

In [39]:
# # define the function to predict the rating for a movie
# def predict_rating(genre, release_year, gross, runtime, certificate):
#     # prepare a dataFrame with the new input data
#     input_data = {
#         'Release_Year': [release_year],
#         'Gross': [gross],
#         'Runtime': [runtime],
#         'Certificate_Approved': [1 if certificate == 'Approved' else 0],
#         'Certificate_G': [1 if certificate == 'G' else 0],
#         'Certificate_PG': [1 if certificate == 'PG' else 0],
#         'Certificate_PG_13': [1 if certificate == 'PG-13' else 0],
#         'Certificate_Passed': [1 if certificate == 'Passed' else 0],
#         'Certificate_R': [1 if certificate == 'R' else 0],
#         'Certificate_U': [1 if certificate == 'U' else 0],
#         'Certificate_UA': [1 if certificate == 'UA' else 0],
#         'Certificate_Unrated': [1 if certificate == 'Unrated' else 0],
#         'Genre_Adventure': [1 if genre == 'Adventure' else 0],
#         'Genre_Animation': [1 if genre == 'Animation' else 0],
#         'Genre_Biography': [1 if genre == 'Biography' else 0],
#         'Genre_Comedy': [1 if genre == 'Comedy' else 0],
#         'Genre_Crime': [1 if genre == 'Crime' else 0],
#         'Genre_Drama': [1 if genre == 'Drama' else 0],
#         'Genre_Family': [1 if genre == 'Family' else 0],
#         'Genre_Fantasy': [1 if genre == 'Fantasy' else 0],
#         'Genre_Film_Noir': [1 if genre == 'Film-Noir' else 0],
#         'Genre_History': [1 if genre == 'History' else 0],
#         'Genre_Horror': [1 if genre == 'Horror' else 0],
#         'Genre_Music': [1 if genre == 'Music' else 0],
#         'Genre_Musical': [1 if genre == 'Musical' else 0],
#         'Genre_Mystery': [1 if genre == 'Mystery' else 0],
#         'Genre_Romance': [1 if genre == 'Romance' else 0],
#         'Genre_Sci_Fi': [1 if genre == 'Sci-Fi' else 0],
#         'Genre_Sport': [1 if genre == 'Sport' else 0],
#         'Genre_Thriller': [1 if genre == 'Thriller' else 0],
#         'Genre_War': [1 if genre == 'War' else 0],
#         'Genre_Western': [1 if genre == 'Western' else 0]
#     }

#     # convert the input dictionary into a DataFrame
#     input_df = pd.DataFrame(input_data)

#     # ensure the columns match exactly with the training data
#     input_df = input_df[df_rating.columns.difference(['Combined_Rating'])]

#     # make the prediction using the trained model
#     predicted_rating = model_rating.predict(input_df)
    
#     return predicted_rating[0]

# # predict the rating for a movie
# predicted_rating = predict_rating(genre=['Action', 'Fantasy', 'Romance', 'Adventure'], release_year=2024, gross=747400000, runtime=160, certificate='PG')
# # print("Predicted Rating for wicked :", predicted_rating)

Predicted Rating for M : 7.598297203858047


In [37]:
# name = input("Enter Movie name: ")
# age = int(input("Enter your age: "))
# print(f"Hello {name}, you are {age} years old!")

In [59]:
#converting Genre ,Certificate  column  to One-hot encoding

df_certificate_encoded = pd.get_dummies(df1['Certificate'], prefix='Cert', drop_first=True)
df1 = pd.concat([df1, df_certificate_encoded], axis=1)

print(df1.head(4))

               Series_Title Certificate  Runtime                 Genre  \
0  The Shawshank Redemption           A    142.0                 Drama   
1             The Godfather           A    175.0          Crime, Drama   
2           The Dark Knight          UA    152.0  Action, Crime, Drama   
3    The Godfather: Part II           A    202.0          Crime, Drama   

         Gross  Release_Year  Combined_Rating  Cert_Approved  Cert_G  Cert_PG  \
0   28341469.0        1994.0             8.65          False   False    False   
1  134966411.0        1972.0             9.60          False   False    False   
2  534858444.0        2008.0             8.70          False   False    False   
3   57300000.0        1974.0             9.00          False   False    False   

   Cert_PG-13  Cert_Passed  Cert_R  Cert_U  Cert_UA  Cert_Unrated  
0       False        False   False   False    False         False  
1       False        False   False   False    False         False  
2       False      

In [69]:
def predict_rating(genre_list, release_year, gross, runtime, certificate):
    input_data = {
        'Release_Year': [release_year],
        'Gross': [gross],
        'Runtime': [runtime],
    }

    # Dynamically get all one-hot encoded Cert_ columns from training data
    cert_columns = [col for col in df_rating.columns if col.startswith('Cert_')]
    for col in cert_columns:
        input_data[col] = [1 if col == f'Cert_{certificate}' else 0]

    # Dynamically get all one-hot encoded Genre_ columns
    genre_columns = [col for col in df_rating.columns if col.startswith('Genre_')]
    for col in genre_columns:
        genre_name = col.replace('Genre_', '')
        input_data[col] = [1 if genre_name in genre_list else 0]

    # Convert input to DataFrame
    input_df = pd.DataFrame(input_data)

    # Ensure input_df has the same columns as training data (excluding target column)
    expected_features = [col for col in df_rating.columns if col != 'Combined_Rating']
    for col in expected_features:
        if col not in input_df.columns:
            input_df[col] = 0  # fill missing columns with 0

    input_df = input_df[expected_features]  # ensure correct column order

    # Make prediction
    predicted_rating = model_rating.predict(input_df)

    return predicted_rating[0]


In [71]:
predicted_rating = predict_rating(
    genre_list=['Action', 'Adventure', 'Fantasy'],
    release_year=2024,
    gross=747400000,
    runtime=160,
    certificate='PG' 
)
print(f"Predicted Rating: {predicted_rating:.2f}")

Predicted Rating: 7.67
