# Linear Regression

In [None]:
import pandas as pd
import os
from IPython.display import display
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [None]:
# Load the TMDB TV dataset from a CSV file located in the current working
# directory.
TMDB_filename = os.path.join(
    os.getcwd(),
    "TMDB_tv_dataset_v3.csv",
)
df = pd.read_csv(filepath_or_buffer=TMDB_filename)

In [None]:
# Names of the columns that are removed from the data before it is used as
# model features.
drop_columns = {
    'popularity',
    'cleaned_overview',
    'overview',
}

In [None]:
# Build the labeled examples: the label y is the 'popularity' column, and the
# feature matrix X is every column of df that is not listed in drop_columns.
y = df['popularity']
X = df.drop(columns=drop_columns)

In [None]:
# Replace every missing value in the feature matrix with the sentinel -1
# (the string 'Unknown' would be an alternative placeholder).
X = X.fillna(value=-1)

We use train_test_split() because we want to split our data into training and test sets.

Training sets are used for fitting the model, which means we train our model with this dataset.

Test sets are used to accurately evaluate our final model's predictions.

In [None]:
# Split the labeled examples into training and test sets. 30% of the rows are
# held out as the test set used to evaluate the model; the fixed random_state
# makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=1234,
)

In [None]:
# Print the name of every training-set column that contains at least one
# missing value (nothing is printed when there are none).
# For per-column counts instead, inspect X_train.isnull().sum().
has_missing = X_train.isna().any()
null_columns = X_train.columns[has_missing]
for col in null_columns:
    print(col)

In [None]:
# Keep only the boolean-typed columns of the training features and show
# their names.
bool_columns = X_train.select_dtypes(include='bool')
print(bool_columns.columns)

Linear Regression

In [None]:
# Show the dtype of each feature column, then list the dtype of every column
# in the full DataFrame one line at a time.
print(X.dtypes)

for col, col_dtype in df.dtypes.items():
    print(f"Column: {col}, Type: {col_dtype}")

In [None]:
# Collect the names of the feature columns stored with the 'object' dtype
# and print them.
object_features = X.select_dtypes(include='object')
categorical_cols = object_features.columns
print(categorical_cols)

In [None]:
# Report how many missing values each feature column contains, computing all
# of the counts in a single pass over X.
for column, missing_count in X.isnull().sum().items():
    print(f"Column: {column}, Missing Values: {missing_count}")

In [None]:
# Create a linear regression model with default settings and fit it to the
# training features and labels.
LR_model = LinearRegression()
LR_model.fit(X=X_train, y=y_train)

In [None]:
# Use the fitted model to predict the label for each test-set example.
y_lr_pred = LR_model.predict(X=X_test)

In [None]:
# Evaluate the linear regression predictions against the held-out test labels.
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    r2_score,
    root_mean_squared_error,
)

# Root mean squared error. root_mean_squared_error was added in
# scikit-learn 1.4, so this cell needs at least that version.
LR_rmse = root_mean_squared_error(y_test, y_lr_pred)

# Mean squared error.
LR_mse = mean_squared_error(y_test, y_lr_pred)

# Coefficient of determination (R^2).
LR_r2 = r2_score(y_test, y_lr_pred)

# Mean absolute error.
LR_mae = mean_absolute_error(y_test, y_lr_pred)

# Print the results
print(f'Linear Regression: Root Mean Squared Error: {LR_rmse}')
print(f'Linear Regression: R^2: {LR_r2}')
print(f'Linear Regression: Mean Absolute Error: {LR_mae}')
print(f'Linear Regression: Mean Squared Error: {LR_mse}')

print('done')