LOAD DATA

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error

# Location of the raw CSV, defined once so both read attempts target the same file.
DATA_PATH = '/content/IMDb Movies India.csv'

# Load the dataset. UTF-8 is attempted first; if the file contains bytes that
# are not valid UTF-8, fall back to latin-1, which maps every possible byte to
# a character and therefore cannot raise UnicodeDecodeError.
try:
    data = pd.read_csv(DATA_PATH, encoding='utf-8')
except UnicodeDecodeError:
    data = pd.read_csv(DATA_PATH, encoding='latin-1')

# Display the first few rows
print(data.head())

                                 Name    Year Duration            Genre  \
0                                         NaN      NaN            Drama   
1  #Gadhvi (He thought he was Gandhi)  (2019)  109 min            Drama   
2                         #Homecoming  (2021)   90 min   Drama, Musical   
3                             #Yaaram  (2019)  110 min  Comedy, Romance   
4                   ...And Once Again  (2010)  105 min            Drama   

   Rating Votes            Director       Actor 1             Actor 2  \
0     NaN   NaN       J.S. Randhawa      Manmauji              Birbal   
1     7.0     8       Gaurav Bakshi  Rasika Dugal      Vivek Ghamande   
2     NaN   NaN  Soumyajit Majumdar  Sayani Gupta   Plabita Borthakur   
3     4.4    35          Ovais Khan       Prateik          Ishita Raj   
4     NaN   NaN        Amol Palekar  Rajat Kapoor  Rituparna Sengupta   

           Actor 3  
0  Rajendra Bhatia  
1    Arvind Jangid  
2       Roy Angana  
3  Siddhant Kapoor  
4    

DATA PREPROCESSING

In [None]:
# Handle missing values: keep only rows in which every column is populated.
# NOTE(review): this also discards rows that are missing only columns the model
# never uses (e.g. Year, Duration, Votes) - confirm that losing them is intended.
data = data.dropna()

# Display the dataset info. DataFrame.info() writes its report straight to
# stdout and returns None, so it is called directly; wrapping it in print()
# would append a stray "None" line to the output.
data.info()


<class 'pandas.core.frame.DataFrame'>
Index: 5659 entries, 1 to 15508
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      5659 non-null   object 
 1   Year      5659 non-null   object 
 2   Duration  5659 non-null   object 
 3   Genre     5659 non-null   object 
 4   Rating    5659 non-null   float64
 5   Votes     5659 non-null   object 
 6   Director  5659 non-null   object 
 7   Actor 1   5659 non-null   object 
 8   Actor 2   5659 non-null   object 
 9   Actor 3   5659 non-null   object 
dtypes: float64(1), object(9)
memory usage: 486.3+ KB
None


FEATURE ENGINEERING

In [None]:
# Rows with missing values were already removed in the DATA PREPROCESSING cell,
# so the frame is only inspected here instead of being filtered a second time.

# Display the dataset info. DataFrame.info() prints its own report and returns
# None, so it is called directly rather than being passed to print().
data.info()

# List the column labels available for feature selection.
print(data.columns)





<class 'pandas.core.frame.DataFrame'>
Index: 5659 entries, 1 to 15508
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      5659 non-null   object 
 1   Year      5659 non-null   object 
 2   Duration  5659 non-null   object 
 3   Genre     5659 non-null   object 
 4   Rating    5659 non-null   float64
 5   Votes     5659 non-null   object 
 6   Director  5659 non-null   object 
 7   Actor 1   5659 non-null   object 
 8   Actor 2   5659 non-null   object 
 9   Actor 3   5659 non-null   object 
dtypes: float64(1), object(9)
memory usage: 486.3+ KB
None
Index(['Name', 'Year', 'Duration', 'Genre', 'Rating', 'Votes', 'Director',
       'Actor 1', 'Actor 2', 'Actor 3'],
      dtype='object')


In [None]:
# Predictors: the text-valued columns that get one-hot encoded further down.
feature_columns = ['Genre', 'Director', 'Actor 1']
features = data[feature_columns]

# Target: the numeric rating the model is trained to predict.
target = data['Rating']

In [None]:
# One-hot encode categorical features. Every distinct value of each column
# (including each distinct comma-separated Genre string as a whole) becomes
# its own 0/1 indicator column.
encoder = OneHotEncoder()
encoded_features = encoder.fit_transform(features).toarray()

# Combine the encoded features into a DataFrame. The original row labels are
# carried over with index=features.index: after dropna() those labels are no
# longer 0..n-1, and a default RangeIndex would leave encoded_df aligned with
# `target` by position only, silently misaligning any label-based operation
# (join, concat, arithmetic) between the two.
encoded_df = pd.DataFrame(
    encoded_features,
    columns=encoder.get_feature_names_out(features.columns),
    index=features.index,
)

# Display the encoded features
print(encoded_df.head())

   Genre_Action  Genre_Action, Adventure  Genre_Action, Adventure, Biography  \
0           0.0                      0.0                                 0.0   
1           0.0                      0.0                                 0.0   
2           0.0                      0.0                                 0.0   
3           0.0                      0.0                                 0.0   
4           0.0                      0.0                                 0.0   

   Genre_Action, Adventure, Comedy  Genre_Action, Adventure, Crime  \
0                              0.0                             0.0   
1                              0.0                             0.0   
2                              0.0                             0.0   
3                              0.0                             0.0   
4                              0.0                             0.0   

   Genre_Action, Adventure, Drama  Genre_Action, Adventure, Family  \
0                           

In [None]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(encoded_df, target, test_size=0.2, random_state=42)

# Train a ridge (L2-regularised linear) regression model.
# Unregularised least squares is ill-posed on this design matrix: the one-hot
# columns of each feature always sum to one (collinear with the intercept) and
# the matrix is very wide relative to the number of rows. The plain
# LinearRegression fit therefore produced coefficients around 1e12-1e14 and a
# test MSE around 1e26 on a rating-scale target. The L2 penalty keeps the
# coefficients bounded while leaving the model linear in the same features.
model = Ridge(alpha=1.0)
model.fit(X_train, y_train)

# Predict ratings on the test set
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

# Display the model coefficients (one per one-hot column, in encoded_df order)
print('Model Coefficients:')
print(model.coef_)


Mean Squared Error: 3.6617684515408506e+26
Model Coefficients:
[ 6.20904364e+12  6.20904364e+12  1.72921770e+14 ... -1.73767210e+11
  1.64627414e+13 -1.50613228e+13]
