<a href="https://colab.research.google.com/github/Adheera13/CODSOFT_DS_Task3/blob/main/Codsoft_DS_Internship3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Train a random-forest regressor that predicts a movie's 'Rating' from its
# year, duration, vote count, genre, director and three lead actors, then
# report the RMSE on a held-out 20% test split.

# Load the dataset
file_path = '/content/imdb.csv'
data = pd.read_csv(file_path, encoding='ISO-8859-1')

# Column groups, used for cleaning below and by the preprocessing pipeline
categorical_features = ['Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']
numeric_features = ['Year', 'Duration', 'Votes']

# Correct the 'Year' column: unparsable entries become <NA>, and negative
# values are flipped to positive. The vectorized .abs() keeps the nullable
# Int64 dtype, whereas a row-wise apply() can degrade it to object dtype.
data['Year'] = pd.to_numeric(data['Year'], errors='coerce').astype('Int64').abs()

# Convert 'Duration' to numeric (strip the literal 'min' suffix).
# regex=False makes the replacement literal on every pandas version.
duration_text = data['Duration'].str.replace('min', '', regex=False).str.strip()
data['Duration'] = pd.to_numeric(duration_text, errors='coerce').astype('Int64')

# Convert 'Votes' to numeric (strip thousands separators)
votes_text = data['Votes'].str.replace(',', '', regex=False).str.strip()
data['Votes'] = pd.to_numeric(votes_text, errors='coerce').astype('Int64')

# Drop rows with missing target variable 'Rating'. The explicit copy makes
# `data` an independent frame so the column assignments below are unambiguous.
data = data.dropna(subset=['Rating']).copy()

# Fill missing categorical values with 'Unknown'
for column in categorical_features:
    data[column] = data[column].fillna('Unknown')

# Fill missing 'Duration' values with the median duration. The median of an
# integer column can be fractional (e.g. 131.5), which cannot be stored in an
# Int64 column, so it is rounded first. If every duration is missing there is
# no median and the column is left untouched.
duration_median = data['Duration'].median()
if pd.notna(duration_median):
    data['Duration'] = data['Duration'].fillna(int(round(duration_median)))

# Define the features and target variable. `.copy()` detaches X from `data`;
# assigning into a bare slice is what raises pandas' SettingWithCopyWarning.
X = data[['Year', 'Duration', 'Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3', 'Votes']].copy()
y = data['Rating']

# Ensure numeric columns are plain float64 so that any remaining missing
# value is a regular NaN rather than pandas' <NA>.
# NOTE(review): 'Year' and 'Votes' are not imputed; confirm the dataset has
# no missing values in them once rows without a 'Rating' are dropped.
X[numeric_features] = X[numeric_features].astype('float64')

# Preprocessor pipeline: numeric columns pass through unchanged, categorical
# columns are one-hot encoded (categories unseen during fit encode as zeros).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Create the model pipeline
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))])

# Split the data into training and test sets (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"Root Mean Squared Error (RMSE): {rmse}")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Year'] = pd.to_numeric(X['Year'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Duration'] = pd.to_numeric(X['Duration'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Votes'] = pd.to_numeric(X['Votes'], errors='coerce')


Root Mean Squared Error (RMSE): 1.0829548345204019
