<a href="https://colab.research.google.com/github/Bull9016/codsoft/blob/main/MoviePrediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Read the IMDb India movies CSV into a DataFrame, decoding the file as latin1
file_path = 'IMDb Movies India.csv'
movie_data = pd.read_csv(filepath_or_buffer=file_path, encoding='latin1')


In [None]:
# Data exploration
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() added a stray "None" line to the
# cell output.
print("Dataset Info:")
movie_data.info()
print("\nMissing Values:")
print(movie_data.isnull().sum())

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15509 entries, 0 to 15508
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      15509 non-null  object 
 1   Year      14981 non-null  object 
 2   Duration  7240 non-null   object 
 3   Genre     13632 non-null  object 
 4   Rating    7919 non-null   float64
 5   Votes     7920 non-null   object 
 6   Director  14984 non-null  object 
 7   Actor 1   13892 non-null  object 
 8   Actor 2   13125 non-null  object 
 9   Actor 3   12365 non-null  object 
dtypes: float64(1), object(9)
memory usage: 1.2+ MB
None

Missing Values:
Name           0
Year         528
Duration    8269
Genre       1877
Rating      7590
Votes       7589
Director     525
Actor 1     1617
Actor 2     2384
Actor 3     3144
dtype: int64


In [None]:
# Rating is the prediction target, so keep only the rows where it is present
movie_data = movie_data.loc[movie_data['Rating'].notna()]
print(movie_data)

                                     Name    Year Duration  \
1      #Gadhvi (He thought he was Gandhi)  (2019)  109 min   
3                                 #Yaaram  (2019)  110 min   
5                    ...Aur Pyaar Ho Gaya  (1997)  147 min   
6                               ...Yahaan  (2005)  142 min   
8                      ?: A Question Mark  (2012)   82 min   
...                                   ...     ...      ...   
15501                     Zulm Ki Hukumat  (1992)      NaN   
15503                     Zulm Ki Zanjeer  (1989)  125 min   
15504                 Zulm Ko Jala Doonga  (1988)      NaN   
15505                               Zulmi  (1999)  129 min   
15508                        Zulm-O-Sitam  (1998)  130 min   

                           Genre  Rating  Votes           Director  \
1                          Drama     7.0      8      Gaurav Bakshi   
3                Comedy, Romance     4.4     35         Ovais Khan   
5         Comedy, Drama, Musical     4.7    8

In [None]:
# Fill missing values in other columns
# Duration arrives as text such as "109 min". Converting through
# astype(str) + to_numeric (rather than the .str accessor on the raw column)
# keeps this cell re-runnable after the column has already become float.
# errors='coerce' turns any unparseable entry into NaN, which is then filled
# with the median like every other gap.
movie_data['Duration'] = pd.to_numeric(
    movie_data['Duration'].astype(str).str.replace(' min', '', regex=False),
    errors='coerce',
).astype(float)
movie_data['Duration'] = movie_data['Duration'].fillna(movie_data['Duration'].median())
# Votes is still a text column at this point (info() reports dtype object) and
# is parsed in a later cell with the .str accessor. Fill gaps with the string
# '0': an integer 0 is not a string, so .str.replace would turn it into NaN
# and the "filled" value would come back as missing.
movie_data['Votes'] = movie_data['Votes'].fillna('0')

In [None]:
# Extract year as numeric
# - Raw string: '\d' inside a normal string literal is an invalid escape
#   sequence and triggers a warning on recent Python versions.
# - expand=False returns a Series for the single capture group instead of a
#   one-column DataFrame.
# - astype(str) keeps the cell re-runnable once Year is already float
#   (e.g. 2019.0 -> "2019.0" -> "2019").
movie_data['Year'] = (
    movie_data['Year']
    .astype(str)
    .str.extract(r'(\d{4})', expand=False)
    .astype(float)
)

In [None]:
# Column to predict, and the columns offered to the model as inputs
target = 'Rating'
features = [
    'Duration',
    'Genre',
    'Director',
    'Votes',
    'Actor 1',
    'Actor 2',
    'Actor 3',
]

In [None]:
# Handle categorical variables using one-hot encoding
categorical_features = ['Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']
# Missing entries become an explicit 'Unknown' category instead of NaN.
categorical_data = movie_data[categorical_features].fillna('Unknown')
# NOTE(review): Director and the actor columns are free-text names, so giving
# every distinct value its own dense column presumably produces a design
# matrix with more columns than rows, which would explain the negative
# R-squared recorded for the linear model below -- confirm with
# encoded_features.shape. min_frequency groups categories seen fewer than
# 5 times into one "infrequent" column per feature to keep the matrix narrow.
encoder = OneHotEncoder(handle_unknown='ignore', min_frequency=5, sparse_output=False)
encoded_features = encoder.fit_transform(categorical_data)


In [None]:
# Convert Votes from text (which may contain ',' separators) to float.
# astype(str) makes every entry a string first, so non-string values (such as
# a 0 inserted by an earlier fillna, or floats when this cell is re-run) are
# parsed instead of silently becoming NaN under the .str accessor.
# Entries that still cannot be parsed are coerced to NaN and counted as 0 votes.
votes_text = movie_data['Votes'].astype(str).str.replace(',', '', regex=False)
movie_data['Votes'] = pd.to_numeric(votes_text, errors='coerce').fillna(0).astype(float)
# Scale numerical features
numerical_features = ['Duration', 'Votes']
scaler = StandardScaler()
scaled_features = scaler.fit_transform(movie_data[numerical_features])

In [None]:
# Assemble the model inputs: scaled numeric columns first, one-hot columns after
y = movie_data[target]
X = np.concatenate((scaled_features, encoded_features), axis=1)


In [None]:
# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit an ordinary least-squares model on the training portion
model = LinearRegression().fit(X_train, y_train)

# Score the held-out rows
# NOTE(review): the saved output of this cell reports an R-squared of -2.41;
# a value below 0 means the model does worse than always predicting the mean.
y_pred = model.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse:.2f}")
print(f"R-squared Score: {r2:.2f}")

Mean Squared Error: 6.34
R-squared Score: -2.41
