In [None]:
from google.colab import drive

# Attach Google Drive to the Colab VM; the dataset and the saved model used in
# later cells are read from and written to paths under this mount point.
mount_point = '/content/drive'
drive.mount(mount_point)

Mounted at /content/drive


Importing Libraries

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
import joblib


Loading and Preprocessing the Dataset

In [None]:
# Load the movie dataset with the 'latin-1' encoding. The untouched frame is kept
# as `raw_data` so the cleaning below can be re-run without re-reading the file.
raw_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Codsoft/Task 2/IMDb Movies India.csv', encoding='latin-1')

# Data preprocessing: build a new frame instead of mutating `raw_data` in place.
data = (
    raw_data
    # Rows without a rating have no target value to learn from.
    .dropna(subset=['Rating'])
    .assign(
        Rating=lambda df: df['Rating'].astype(float),
        # "Year": keep the first run of digits and convert to float.
        # Raw string literal: '\d' in a plain literal is an invalid escape sequence.
        Year=lambda df: df['Year'].str.extract(r'(\d+)', expand=False).astype(float),
        # "Duration": keep the first run of digits and convert to float.
        Duration=lambda df: df['Duration'].str.extract(r'(\d+)', expand=False).astype(float),
        # "Votes": strip thousands separators, then parse. Entries that are still
        # not numeric become NaN (handled by the imputation step later) instead
        # of aborting the whole cell; the dtype stays integer when every value parses.
        Votes=lambda df: pd.to_numeric(
            df['Votes'].str.replace(',', '', regex=False), errors='coerce'
        ),
    )
)

# Convert the ', '-separated genre strings into one 0/1 indicator column per genre
genre_dummies = data['Genre'].str.get_dummies(sep=', ')
data = pd.concat([data, genre_dummies], axis=1)


Feature Extraction

In [None]:

# The target and the text columns listed here are left out of the feature
# matrix; every remaining column of `data` is used as a predictor.
excluded_columns = ['Name', 'Rating', 'Director', 'Actor 1', 'Actor 2', 'Actor 3', 'Genre']
y = data['Rating']
X = data.drop(columns=excluded_columns)


Data Splitting

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


Imputing Missing Values

In [None]:

# Instantiate the imputer with the mean strategy
imputer = SimpleImputer(strategy='mean')

# Fit the imputer on the training data only, then apply the same training means
# to the test data so no information from the test split leaks into the fit.
# The imputer hands back bare NumPy arrays, so the results are wrapped back into
# DataFrames with the original column names and row index. This keeps feature
# names attached to what the model is trained on, so a later predict() call with
# a DataFrame is validated against them instead of raising a feature-name warning.
# If the imputer ever dropped a column (e.g. one that is entirely NaN), the
# DataFrame construction fails loudly rather than silently misaligning features.
feature_names = X_train.columns
X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=feature_names, index=X_train.index)
X_test = pd.DataFrame(imputer.transform(X_test), columns=feature_names, index=X_test.index)


Training the Movie Rating Prediction Model

In [None]:
# Fit a linear regression on the imputed training features; fit() returns the
# estimator itself, so construction and training are chained.
regression_model = LinearRegression().fit(X_train, y_train)
regression_model

Model Evaluation

In [None]:
# Score the held-out test split with the trained model
y_pred = regression_model.predict(X_test)

# Mean squared error and R-squared, both computed from true vs. predicted ratings
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}', f'R-squared: {r2}', sep='\n')



Mean Squared Error: 1.514619017215519
R-squared: 0.185314269389396


Saving the Trained Model to a .pkl File

In [None]:

# Persist the fitted regression model on Google Drive as a .pkl file;
# joblib.dump stays the last expression so the written path is displayed.
model_path = '/content/drive/MyDrive/Colab Notebooks/Codsoft/Task 2/movie_rating_model.pkl'
joblib.dump(regression_model, model_path)

['/content/drive/MyDrive/Colab Notebooks/Codsoft/Task 2/movie_rating_model.pkl']

Loading the Model and Making Predictions on New Data

In [None]:

# Load the saved model from the .pkl file.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code. Only
# load model files you created yourself or otherwise trust.
loaded_model = joblib.load('/content/drive/MyDrive/Colab Notebooks/Codsoft/Task 2/movie_rating_model.pkl')

# Example: Predict the rating for a new movie
new_data = pd.DataFrame({
    'Year': [2022],
    'Duration': [120],
    'Votes': [5000],
    'Drama': [0],
    'Comedy': [1],
    'Action': [1],
    'Horror': [0],
    'Mystery': [0],
    'Thriller': [1],
})

# Align the new row with the training feature matrix in one step: columns are
# put in the order of X, dummy variables the example does not mention are added
# with value 0, and any column that is not a training feature is dropped.
new_data = new_data.reindex(columns=X.columns, fill_value=0)

# Give the model the same kind of input it was fitted on. An estimator fitted on
# a DataFrame exposes `feature_names_in_` and validates column names; one fitted
# on a plain array does not, and would warn about unexpected feature names if it
# were handed a DataFrame.
fitted_with_names = hasattr(loaded_model, 'feature_names_in_')
model_input = new_data if fitted_with_names else new_data.to_numpy()

new_prediction = loaded_model.predict(model_input)
print(f'Predicted Rating for New Movie: {new_prediction[0]}')


Predicted Rating for New Movie: 4.882184845354189


