# Importing necessary libraries for data manipulation, analysis, and machine learning


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Loading the dataset into a pandas DataFrame

In [3]:
df = pd.read_csv('IMDb Movies India.csv', encoding='ISO-8859-1')


# file_path = r'C:\Users\DS\Desktop\folder\IMDb-Movies-India.csv' if data is in a different directory
# df = pd.read_csv(file_path, encoding='ISO-8859-1')

# Exploring the dataset to understand its structure and contents

In [4]:
print("First 5 rows of the dataframe:")
print(df.head())  # shows first 5 entries of rows

First 5 rows of the dataframe:
                                 Name    Year Duration            Genre  \
0                                         NaN      NaN            Drama   
1  #Gadhvi (He thought he was Gandhi)  (2019)  109 min            Drama   
2                         #Homecoming  (2021)   90 min   Drama, Musical   
3                             #Yaaram  (2019)  110 min  Comedy, Romance   
4                   ...And Once Again  (2010)  105 min            Drama   

   Rating Votes            Director       Actor 1             Actor 2  \
0     NaN   NaN       J.S. Randhawa      Manmauji              Birbal   
1     7.0     8       Gaurav Bakshi  Rasika Dugal      Vivek Ghamande   
2     NaN   NaN  Soumyajit Majumdar  Sayani Gupta   Plabita Borthakur   
3     4.4    35          Ovais Khan       Prateik          Ishita Raj   
4     NaN   NaN        Amol Palekar  Rajat Kapoor  Rituparna Sengupta   

           Actor 3  
0  Rajendra Bhatia  
1    Arvind Jangid  
2       Roy Anga

In [5]:
print("\nInformation about the dataframe:")
print(df.info()) 


Information about the dataframe:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15509 entries, 0 to 15508
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      15509 non-null  object 
 1   Year      14981 non-null  object 
 2   Duration  7240 non-null   object 
 3   Genre     13632 non-null  object 
 4   Rating    7919 non-null   float64
 5   Votes     7920 non-null   object 
 6   Director  14984 non-null  object 
 7   Actor 1   13892 non-null  object 
 8   Actor 2   13125 non-null  object 
 9   Actor 3   12365 non-null  object 
dtypes: float64(1), object(9)
memory usage: 1.2+ MB
None


In [6]:
print("\nDescriptive statistics of the dataframe:")
print(df.describe())


Descriptive statistics of the dataframe:
            Rating
count  7919.000000
mean      5.841621
std       1.381777
min       1.100000
25%       4.900000
50%       6.000000
75%       6.800000
max      10.000000


In [7]:
# Checking the actual column names
print("\nColumn names in the DataFrame:")
print(df.columns)


Column names in the DataFrame:
Index(['Name', 'Year', 'Duration', 'Genre', 'Rating', 'Votes', 'Director',
       'Actor 1', 'Actor 2', 'Actor 3'],
      dtype='object')


# Data Preprocessing
- Preprocessing the data to handle missing values, encode categorical variables, and scale numerical features if necessary


### Handling Missing Values

In [8]:
print("\nMissing values in each column:")
print(df.isnull().sum())  # Check for missing values

df.dropna(inplace=True)  # Drop rows with missing values (or we can fill them with appropriate values)
print("\nMissing values after dropping rows:")
print(df.isnull().sum())  # Check for missing values again


Missing values in each column:
Name           0
Year         528
Duration    8269
Genre       1877
Rating      7590
Votes       7589
Director     525
Actor 1     1617
Actor 2     2384
Actor 3     3144
dtype: int64

Missing values after dropping rows:
Name        0
Year        0
Duration    0
Genre       0
Rating      0
Votes       0
Director    0
Actor 1     0
Actor 2     0
Actor 3     0
dtype: int64


### Encoding Categorical Variables

In [9]:
# Encode categorical variables like genre, director, and actors using one-hot encoding
categorical_features = ['Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']
numerical_features = ['Duration']  # Example numerical features

preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)

# Printing unique values of categorical columns to understand the data better
print("\nUnique values in 'Genre':")
print(df['Genre'].unique())
print("\nUnique values in 'Director':")
print(df['Director'].unique())
print("\nUnique values in 'Actor 1':")
print(df['Actor 1'].unique())
print("\nUnique values in 'Actor 2':")
print(df['Actor 2'].unique())
print("\nUnique values in 'Actor 3':")
print(df['Actor 3'].unique())


Unique values in 'Genre':
['Drama' 'Comedy, Romance' 'Comedy, Drama, Musical' 'Drama, Romance, War'
 'Horror, Mystery, Thriller' 'Action, Crime, Thriller' 'Horror'
 'Horror, Romance, Thriller' 'Comedy, Drama, Romance' 'Comedy, Drama'
 'Crime, Drama, Mystery' 'Horror, Thriller' 'Comedy, Horror'
 'Drama, Horror, Mystery' 'Action, Thriller' 'Action'
 'Horror, Mystery, Romance' 'Horror, Mystery' 'Drama, Horror, Romance'
 'Action, Drama, History' 'Action, Drama, War' 'Thriller' 'Comedy'
 'Adventure, Horror, Mystery' 'Action, Sci-Fi' 'Crime, Mystery, Thriller'
 'Drama, History' 'Sport' 'Biography, Drama, History' 'Horror, Romance'
 'Crime, Drama' 'Adventure, Drama' 'Comedy, Mystery, Thriller'
 'Drama, Romance' 'Crime, Thriller' 'Horror, Sci-Fi, Thriller'
 'Drama, Mystery, Thriller' 'Drama, Family, Musical' 'Action, Comedy'
 'Action, Adventure, Fantasy' 'Documentary' 'Drama, Horror, Musical'
 'Action, Biography, Drama' 'Action, Fantasy, Mystery'
 'Adventure, Drama, Mystery' 'Mystery, Thrille

# Feature Engineering
- Create or transform features if needed. For example, we can create a feature that counts the number of actors


In [10]:
df['num_actors'] = df[['Actor 1', 'Actor 2', 'Actor 3']].notna().sum(axis=1)

In [11]:
# Ensure numerical columns are of numeric type
df[numerical_features] = df[numerical_features].apply(pd.to_numeric, errors='coerce')

In [12]:
# Ensure target column exists and is of numeric type
if 'Rating' not in df.columns:
    raise ValueError("The 'Rating' column is not present in the dataset.")
df['Rating'] = pd.to_numeric(df['Rating'], errors='coerce')

In [13]:
# Splitting the data into training and testing sets
X = df.drop('Rating', axis=1)
y = df['Rating']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
# Check if there are any remaining missing values after dropping/handling
if df.isnull().sum().any():
    raise ValueError("There are still missing values in the data.")

# Building a regression model using a pipeline to streamline preprocessing and model training
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

pipeline.fit(X_train, y_train)

# Evaluating the model using metrics like Mean Squared Error (MSE) and R-squared (R²)
y_pred = pipeline.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'\nMean Squared Error: {mse}')
print(f'R-squared: {r2}')

# Interpreting the results and considering ways to improve the model, such as using more sophisticated models like Random Forest, Gradient Boosting, or neural networks

ValueError: There are still missing values in the data.