<a href="https://colab.research.google.com/github/Chakinamnithya/CODESOFT_INTERNSHIP/blob/main/CodeSoftTask2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**IMPORTING LIBRARIES**

In [None]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

**LOADING DATASET**

In [None]:

# Read the IMDb India movies CSV, trying ISO-8859-1 first and cp1252 second.
# Only a decoding failure triggers the fallback; any other problem (missing
# file, malformed CSV, user interrupt) is allowed to surface instead of being
# silently retried.
try:
    df = pd.read_csv("/content/IMDb Movies India.csv", encoding="ISO-8859-1")
except UnicodeDecodeError:
    df = pd.read_csv("/content/IMDb Movies India.csv", encoding="cp1252")

print("✅ Dataset Loaded Successfully")
print(df.head())
# DataFrame.info() prints its own report and returns None, so it is called
# directly rather than wrapped in print() (which would emit a stray "None").
df.info()


✅ Dataset Loaded Successfully
                                 Name    Year Duration            Genre  \
0                                         NaN      NaN            Drama   
1  #Gadhvi (He thought he was Gandhi)  (2019)  109 min            Drama   
2                         #Homecoming  (2021)   90 min   Drama, Musical   
3                             #Yaaram  (2019)  110 min  Comedy, Romance   
4                   ...And Once Again  (2010)  105 min            Drama   

   Rating Votes            Director       Actor 1             Actor 2  \
0     NaN   NaN       J.S. Randhawa      Manmauji              Birbal   
1     7.0     8       Gaurav Bakshi  Rasika Dugal      Vivek Ghamande   
2     NaN   NaN  Soumyajit Majumdar  Sayani Gupta   Plabita Borthakur   
3     4.4    35          Ovais Khan       Prateik          Ishita Raj   
4     NaN   NaN        Amol Palekar  Rajat Kapoor  Rituparna Sengupta   

           Actor 3  
0  Rajendra Bhatia  
1    Arvind Jangid  
2       Roy Angan

**FEATURE SELECTION**

In [None]:

# Combine the three actor columns into one space-separated text feature;
# missing names are replaced by empty strings before joining.
df['Actors'] = df[['Actor 1', 'Actor 2', 'Actor 3']].fillna('').agg(' '.join, axis=1)

# Keep only the model inputs and the target, and drop rows with no rating.
# dropna() is chained (returning a new frame) instead of being applied with
# inplace=True to a column subset, which avoids pandas' SettingWithCopyWarning
# while producing the same rows and index.
df = df[['Genre', 'Director', 'Actors', 'Rating']].dropna(subset=['Rating'])

print("\n✅ After Cleaning:")
print(df.head())


✅ After Cleaning:
                       Genre        Director  \
1                      Drama   Gaurav Bakshi   
3            Comedy, Romance      Ovais Khan   
5     Comedy, Drama, Musical    Rahul Rawail   
6        Drama, Romance, War  Shoojit Sircar   
8  Horror, Mystery, Thriller   Allyson Patel   

                                            Actors  Rating  
1        Rasika Dugal Vivek Ghamande Arvind Jangid     7.0  
3               Prateik Ishita Raj Siddhant Kapoor     4.4  
5  Bobby Deol Aishwarya Rai Bachchan Shammi Kapoor     4.7  
6    Jimmy Sheirgill Minissha Lamba Yashpal Sharma     7.4  
8            Yash Dave Muntazir Ahmad Kiran Bhatia     5.6  


**SPLITTING THE DATA INTO TRAIN AND TEST DATA**

In [None]:

# Predictors are the three text columns; the target is the rating column.
X = df.loc[:, ['Genre', 'Director', 'Actors']]
y = df.loc[:, 'Rating']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

**PREPROCESSING AND MODEL PIPELINE**

In [None]:

# One-hot encode the three text columns. Categories that were not present
# when the encoder was fitted are ignored at prediction time rather than
# raising an error (handle_unknown='ignore').
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), ['Genre', 'Director', 'Actors']),
])

# Full model: encoding step followed by an ordinary least-squares regressor.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
])

**MODEL TRAINING**

In [None]:
# Fit the pipeline on the training split, then predict ratings for the
# held-out rows (fit() returns the fitted pipeline, so the calls chain).
y_pred = model.fit(X_train, y_train).predict(X_test)

**MODEL EVALUATION**

In [None]:

# Report error metrics on the held-out test split.
print("\n📊 Model Performance:")
for label, value in (
    ("MAE :", mean_absolute_error(y_test, y_pred)),
    # RMSE is the square root of the mean squared error.
    ("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred))),
    ("R² Score:", r2_score(y_test, y_pred)),
):
    print(label, value)


📊 Model Performance:
MAE : 1.1602741658507576
RMSE: 1.4911487832671195
R² Score: -0.1959930643556651


**PREDICTING NEW VALUES**

In [None]:

# Single hypothetical movie, given as one record with the same three feature
# columns the model was trained on.
# NOTE(review): training 'Actors' values are three names joined by spaces, so
# this two-name string presumably matches no known category and is ignored by
# the encoder — confirm this is the intended input.
sample = pd.DataFrame([
    {
        'Genre': 'Drama',
        'Director': 'Rajkumar Hirani',
        'Actors': 'Aamir Khan Kareena Kapoor',
    }
])

predicted_ratings = model.predict(sample)
print("\n🎥 Predicted Rating for Sample Movie:", predicted_ratings[0])


🎥 Predicted Rating for Sample Movie: 7.3806306917837095
