# Linear Regression

In [29]:
import pandas as pd
import os
from IPython.display import display
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [30]:
TMDB_filename = os.path.join(os.getcwd(), "TMDB_tv_dataset_v3.csv")
df = pd.read_csv(TMDB_filename)

In [31]:
drop_columns = {'popularity', 'cleaned_overview', 'overview'}

In [32]:
# Creating our labeled examples with 'y' as our label and 'X' being our features
y = df['popularity']
X = df.drop(columns = drop_columns, axis = 1)

In [33]:
# Fill missing values in one-hot encoded columns with -1 or 'Unknown'
X.fillna(-1, inplace=True)  # or use 'Unknown'

We use train_test_split() because we want to split our data into training and test sets.

Train tests are used for fitting the model which means we train our model with this dataset.

Test sets are used to accurately evalute our final model's predicitions.

In [34]:
# Creating the training and test sets out of the labeled examples
# 30% of our data is for the test size, this will be the data used to test the model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 1234)

Linear Regression

In [35]:
LR_model = LinearRegression()
LR_model.fit(X_train, y_train)

In [36]:
y_lr_pred = LR_model.predict(X_test)

In [37]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, mean_squared_error, root_mean_squared_error

# Compute RMSE using the new function
LR_rmse = root_mean_squared_error(y_test, y_lr_pred)

LR_mse = mean_squared_error(y_test, y_lr_pred)

# Compute R² score
LR_r2 = r2_score(y_test, y_lr_pred)

# Compute MAE using the new function
LR_mae = mean_absolute_error(y_test, y_lr_pred)

# Print the results
print('Linear Regression: Root Mean Squared Error: {}'.format(LR_rmse))
print('Linear Regression: R^2: {}'.format(LR_r2))
print('Linear Regression: Mean Absolute Error: {}'.format(LR_mae))
print('Linear Regression: Mean Squared Error: {}'.format(LR_mse))

print('done')

Linear Regression: Root Mean Squared Error: 0.011096171926217722
Linear Regression: R^2: 0.18886782362540855
Linear Regression: Mean Absolute Error: 0.001911603973657144
Linear Regression: Mean Squared Error: 0.00012312503141618232
done


## P-value
I read on https://stackoverflow.com/questions/65672273/feature-selection-in-multivariate-linear-regression that by checking the p-values, the probability the predicted result (popularity) is determined by that feature. If the p-value is <= 0.05 would be included in the model. Below I will be caluating the p-value of our features and only including the feature's with a p-value of 5% or less in hopes of increasing our R^2 value

In [38]:
import statsmodels.api as sm

In [42]:
# https://www.statology.org/statsmodels-linear-regression-p-value/

y = df['popularity']
X = df.drop(columns = drop_columns, axis = 1)

# Add constant to predictor variables
X2= sm.add_constant(X)

#Fit linear regression model
model = sm.OLS(y, X2).fit()

# View model's summary
print(model.summary())

ValueError: Pandas data cast to numpy dtype of object. Check input data with np.asarray(data).

In [41]:
print(X.dtypes)

for col in df.columns:
    if df[col].dtype == 'object':  # Check if the column is of type 'object'
        print(f"Column: {col}, Type: {df[col].dtype}")

number_of_seasons       float64
number_of_episodes      float64
vote_count              float64
vote_average            float64
adult                   float64
                         ...   
first_air_date_fall        bool
last_air_date_winter       bool
last_air_date_spring       bool
last_air_date_summer       bool
last_air_date_fall         bool
Length: 97, dtype: object
Column: overview, Type: object
Column: cleaned_overview, Type: object
