In [16]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.linear_model import Lasso, Ridge, LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
import pickle as pkl
from tqdm import tqdm

In [None]:
data = pd.read_csv('')

In [None]:
import statsmodels.api as sm
import pandas as pd
import numpy as np
np.random.seed(444)
data = {
     'industry': ['mining', 'transportation', 'hospitality', 'finance', 'entertainment'],
    'debt_ratio':np.random.randn(5),
    'cash_flow':np.random.randn(5) + 90
 }
data = pd.DataFrame.from_dict(data)
data = pd.concat((
     data,
     pd.get_dummies(data['industry'], drop_first=True)), axis=1)
# You could also use data.drop('industry', axis=1)
# in the call to pd.concat()
data

### OLS Regression Results

#### Categories that don't need OHE
'directors', 'actors', 'genre'

In [14]:
y = data.pop('gross')
# X = data
X = pd.concat((
    data,
    pd.get_dummies(data, columns= ['language', 'country', 'rating'], drop_first=True)), axis=1)


stand_X = (X - np.mean(X, axis = 0)) / np.std(X, axis = 0)
stand_X

stand_Y = Y

X2 = stand_X
Y2 = stand_Y

X2 = sm.add_constant(X2, prepend=False)
mod = sm.OLS(Y2, X2)
res = mod.fit()
print(res.summary())
print(res.params)

NameError: name 'df' is not defined

### Data Transformations

In [None]:
# y = data.pop('gross')
# X = data
# X = data.drop(['gross'], axis=1)

# Shuffle to false to handle time data
X_train, y_train, X_test, y_test = train_test_split(X, y, random_state = 42, shuffle=False, test_size = 0.2)
X_train, y_train, X_val, y_val = train_test_split(X_train, y_train, random_state = 42, shuffle=False, test_size = 0.2)

In [8]:
categorical_cols = []

In [None]:
# Text Preprocessor
text_preprocesser = Pipeline(
    steps=[
        # Input tfidf parameters
        ('tfidf', TfidfVectorizer())
    ])

In [None]:
# Categorical Preprocessor
categorical_preprocessor = Pipeline(
    steps=[
        # Change to 'ignore' if error raised
        ("OHE", OneHotEncoder(handle_unknown='error', drop='first'))
    ])

In [9]:
#Combine preprocessors
preprocessor = ColumnTransformer(
    transformers=[
        ('text', text_preprocesser, 'plot'),
        ('category', categorical_preprocessor, categorical_cols)
    ])

In [None]:
lr_pipe = make_pipeline(
    preprocessor, 
    StandardScaler(with_mean=False), 
    LinearRegression()
)

lasso_pipe = make_pipeline(
    preprocessor, 
    StandardScaler(with_mean=False), 
    Lasso()
)

ridge_pipe = make_pipeline(
    preprocessor, 
    StandardScaler(with_mean=False), 
    Ridge()
)

In [12]:
## if make_pipeline doesn't work
# pipe = Pipeline(
#         steps= [
#             ('preprocessor', preprocessor),
#             ('scaler', StandardScaler(with_mean=False)),
#             ('linearReg', LinearRegression())
#         ])

### Linear Regression Base Model

In [None]:
pipe.fit(X_train, y_train)
train_score = pipe.score(X_train, y_train)
val_score = pipe.score(X_val, y_val)
test_score = pipe.score(X_test, y_test)
val_pred = pipe.predict(X_val)
test_pred = pipe.predict(X_test)

print('Linear Regression Results')
print("Train score:", train_score)
print("Val score:", val_score)
print("Test score:", test_score)
print("Val RMSE:", np.sqrt(mean_squared_error(y_val, val_pred))
print("Test RMSE:", np.sqrt(mean_squared_error(y_test, test_pred))

### Ridge Base Model

In [None]:
ridge_pipe.fit(X_train, y_train)
train_score = ridge_pipe.score(X_train, y_train)
val_score = ridge_pipe.score(X_val, y_val)
test_score = ridge_pipe.score(X_test, y_test)
val_pred = ridge_pipe.predict(X_val)
test_pred = ridge_pipe.predict(X_test)

print('Ridge Regression Results')
print("Train score:", train_score)
print("Val score:", val_score)
print("Test score:", test_score)
print("Val RMSE:", np.sqrt(mean_squared_error(y_val, val_pred))
print("Test RMSE:", np.sqrt(mean_squared_error(y_test, test_pred))

### Lasso Base Model

In [None]:
lasso_pipe.fit(X_train, y_train)
train_score = lasso_pipe.score(X_train, y_train)
val_score = lasso_pipe.score(X_val, y_val)
test_score = lasso_pipe.score(X_test, y_test)
val_pred = lasso_pipe.predict(X_val)
test_pred = lasso_pipe.predict(X_test)

print('Lasso Regression Results')
print("Train score:", train_score)
print("Val score:", val_score)
print("Test score:", test_score)
print("Val RMSE:", np.sqrt(mean_squared_error(y_val, val_pred))
print("Test RMSE:", np.sqrt(mean_squared_error(y_test, test_pred))

### GridSearchCV

#### Ridge

In [None]:
param_grid = {
    "regressor__alpha": np.logspace(-3, 3, 7),
#     "regressor__max_iter": np.logspace(1, 5, 5)
}

grid_search = GridSearchCV(pipe, param_grid=param_grid, n_jobs=-1, verbose = 2)
grid_search.fit(X_train, y_train)  # click on the diagram below to see the details of each step
print("Best params:", grid_search.best_params_)
print(f"Internal CV score: {grid_search.best_score_:.3f}")

#### Lasso

In [None]:
param_grid = {
    "regressor__alpha": np.logspace(-3, 3, 7),
#     "regressor__max_iter": np.logspace(1, 5, 5)
}

grid_search = GridSearchCV(pipe, param_grid=param_grid, n_jobs=-1, verbose = 2)
grid_search.fit(X_train, y_train)  # click on the diagram below to see the details of each step
print("Best params:", grid_search.best_params_)
print(f"Internal CV score: {grid_search.best_score_:.3f}")