In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns # For all our visualization needs.
import statsmodels.api as sm # What does this do? Cross-sectional models and methods. The API focuses on models and the most frequently used statistical test, and tools.
from statsmodels.graphics.api import abline_plot # What does this do? Plot a line given an intercept and slope.
from sklearn.metrics import mean_squared_error, r2_score # What does this do? APIs for evaluating the quality of a model’s predictions
from sklearn.model_selection import train_test_split #  What does this do? Split arrays or matrices into random train and test subsets.
from sklearn import linear_model, preprocessing # What does this do? Set of methods intended for regression in which the target value is expected to be a linear combination of the features. Common utility functions and transformer classes to change raw feature vectors into a representation that is more suitable for the downstream estimators
import warnings # For handling error messages.
# Silence two kinds of warnings so they do not clutter the output of later cells:
#   1. every FutureWarning, whichever library raises it;
#   2. warnings raised from modules whose name matches "scipy" and whose
#      message starts with "internal gelsd" (both arguments are regexes).
warnings.simplefilter(action="ignore", category=FutureWarning)
warnings.filterwarnings(action="ignore", module="scipy", message="^internal gelsd")
# scipy is a library for scientific computing; its stats module provides statistical tests and distributions
from scipy import stats
# random enables us to generate random numbers
import random

In [2]:
# Load the dataset from Data/EDA_cleaned.csv (path is relative to the
# notebook's working directory).
df = pd.read_csv('Data/EDA_cleaned.csv')
# Preview the first rows to check that the file parsed as expected.
print(df.head())

   Unnamed: 0  Year  ($)/(count)     type  company
0           0  2023          8.4  Revenue   disney
1           1  2023         33.7  Revenue  netflix
2           2  2023         11.2  Revenue     hulu
3           3  2023          2.8  Revenue   twitch
4           4  2023         86.0  Revenue  youtube


In [3]:
# Drop the leftover index column that the CSV carried in as "Unnamed: 0".
# errors="ignore" keeps this cell safe to re-run: without it, a second run
# raises KeyError because the column has already been removed from `df`.
df = df.drop(columns="Unnamed: 0", errors="ignore")

In [4]:
# Re-check the preview now that the "Unnamed: 0" column has been dropped.
print(df.head())

   Year  ($)/(count)     type  company
0  2023          8.4  Revenue   disney
1  2023         33.7  Revenue  netflix
2  2023         11.2  Revenue     hulu
3  2023          2.8  Revenue   twitch
4  2023         86.0  Revenue  youtube


In [5]:
# List each column's dtype; the object columns are the ones that get
# one-hot encoded in the next step.
df.dtypes

Year             int64
($)/(count)    float64
type            object
company         object
dtype: object

In [6]:
# One-hot encode every object-dtype column (`type` and `company` here).
# `dfo` keeps a copy of the original categorical columns.
dfo = df.select_dtypes(include=['object'])
# `columns=` makes get_dummies drop the listed source columns and append their
# indicator columns (prefixed with the source column name) in a single call,
# leaving the remaining columns first and untouched.
# NOTE(review): every level gets its own indicator (no drop_first), so the
# indicators of each variable sum to 1 and are collinear with an intercept --
# confirm this is acceptable for the model fitted later.
df = pd.get_dummies(df, columns=list(dfo.columns))

In [7]:
# Preview the frame after one-hot encoding the object columns.
print(df.head())

   Year  ($)/(count)  type_Revenue  type_Subscribers  company_disney  \
0  2023          8.4          True             False            True   
1  2023         33.7          True             False           False   
2  2023         11.2          True             False           False   
3  2023          2.8          True             False           False   
4  2023         86.0          True             False           False   

   company_hulu  company_netflix  company_prime  company_tiktok  \
0         False            False          False           False   
1         False             True          False           False   
2          True            False          False           False   
3         False            False          False           False   
4         False            False          False           False   

   company_twitch  company_youtube  
0           False            False  
1           False            False  
2           False            False  
3            Tru

In [8]:
# Display the encoded frame (pandas elides the middle rows in the rendered output).
df

Unnamed: 0,Year,($)/(count),type_Revenue,type_Subscribers,company_disney,company_hulu,company_netflix,company_prime,company_tiktok,company_twitch,company_youtube
0,2023,8.4,True,False,True,False,False,False,False,False,False
1,2023,33.7,True,False,False,False,True,False,False,False,False
2,2023,11.2,True,False,False,True,False,False,False,False,False
3,2023,2.8,True,False,False,False,False,False,False,True,False
4,2023,86.0,True,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...
107,2016,79.9,False,True,False,False,True,False,False,False,False
108,2016,12.0,False,True,False,True,False,False,False,False,False
109,2016,0.0,False,True,True,False,False,False,False,False,False
110,2016,46.0,False,True,False,False,False,True,False,False,False


In [9]:
# Feature matrix: every column of `df` except the target `($)/(count)`.
X = df.drop('($)/(count)', axis=1)

In [10]:
# Preview the feature matrix; the target column `($)/(count)` should be absent.
X.head()

Unnamed: 0,Year,type_Revenue,type_Subscribers,company_disney,company_hulu,company_netflix,company_prime,company_tiktok,company_twitch,company_youtube
0,2023,True,False,True,False,False,False,False,False,False
1,2023,True,False,False,False,True,False,False,False,False
2,2023,True,False,False,True,False,False,False,False,False
3,2023,True,False,False,False,False,False,False,True,False
4,2023,True,False,False,False,False,False,False,False,True


In [11]:
# Target vector: the `($)/(count)` column of `df`.
y = df.loc[:, '($)/(count)']

In [12]:
# Hold out 25% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=246,
)