In [4]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import random

In [37]:
def generate_label(row):
    for name, value in row.items():
        label = random.random()*value
    return label + random.random()
def generate_data(num_features, num_size):
    np.random.seed(42)
    cols = []
    for i in range(1,num_features+1):
        cols.append('X_' + str(i))
    df = pd.DataFrame(np.random.rand(num_size, num_features), columns=cols)
    df['label'] = df.apply(generate_label,axis=1)
    return df
all_df = generate_data(10,1000)

In [38]:
from sklearn.model_selection import train_test_split
def split_data(df,label_column):
    X = df.copy()
    y = df[label_column]
    X.drop(columns=label_column,inplace=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    return X_train, X_test, y_train, y_test
X_train, X_test, y_train, y_test = split_data(all_df,'label')

In [7]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import math
def linear_regression(X_train, X_test, y_train, y_test):
    model = LinearRegression()
    model.fit(X_train, y_train)
    
    print("Coefficients:", model.coef_)
    print("Intercept:", model.intercept_)

    
    df_feature_imp = pd.DataFrame({'variable':X_train.columns, 'imp':model.coef_})
    print('Feature Importance:\n', df_feature_imp)

    print('R-squared:', model.score(X_test, y_test))

    model_predictions = model.predict(X_test)
    print('RMSE: ', math.sqrt(mean_squared_error(y_test,model_predictions)))
linear_regression(X_train, X_test, y_train, y_test)

Coefficients: [ 0.0468079  -0.00069355  0.08657899 -0.0601108   0.0289736   0.0419658
 -0.10525025 -0.0255843   0.02071983  0.46226251]
Intercept: 0.5050761857539199
Feature Importance:
   variable       imp
0      X_1  0.046808
1      X_2 -0.000694
2      X_3  0.086579
3      X_4 -0.060111
4      X_5  0.028974
5      X_6  0.041966
6      X_7 -0.105250
7      X_8 -0.025584
8      X_9  0.020720
9     X_10  0.462263
R-squared: 0.17705053511035895
RMSE:  0.33987572831404134


In [20]:
from sklearn.preprocessing import StandardScaler
def linear_regression_with_scaling(X_train, X_test, y_train, y_test):
    scaler = StandardScaler()
    scaler.fit(X_train)
    
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    
    model = LinearRegression()
    model.fit(X_train, y_train)
    
    print("Coefficients:", model.coef_)
    print("Intercept:", model.intercept_)

    
    #df_feature_imp = pd.DataFrame({'variable':X_train.columns, 'imp':model.coef_})
    #print('Feature Importance:\n', df_feature_imp)

    print('R-squared:', model.score(X_test, y_test))

    model_predictions = model.predict(X_test)
    print('RMSE: ', math.sqrt(mean_squared_error(y_test,model_predictions)))
linear_regression_with_scaling(X_train, X_test, y_train, y_test)

Coefficients: [-0.02145843 -0.00055304  0.02458198 -0.01761171  0.00836133  0.01226639
 -0.02973458 -0.00739245  0.0060569   0.13148112  0.03589811]
Intercept: 0.747120880153343
R-squared: 0.17783285071889143
RMSE:  0.33971414287106855


In [8]:
import statsmodels.api as sm
def stats_linear_regression(X_train, X_test, y_train, y_test):
    X_sm_train = sm.add_constant(X_train)
    X_sm_test = sm.add_constant(X_test)
    
    model = sm.OLS(y_train, X_sm_train).fit()

    print(model.summary())

    model_predictions = model.predict(X_sm_test)
    print('RMSE: ', math.sqrt(mean_squared_error(y_test,model_predictions)))
    print(sm.OLS(y_test, model_predictions).fit().summary())
stats_linear_regression(X_train, X_test, y_train, y_test)

                            OLS Regression Results                            
Dep. Variable:                  label   R-squared:                       0.158
Model:                            OLS   Adj. R-squared:                  0.147
Method:                 Least Squares   F-statistic:                     14.81
Date:                Thu, 27 Feb 2025   Prob (F-statistic):           2.26e-24
Time:                        14:40:45   Log-Likelihood:                -243.81
No. Observations:                 800   AIC:                             509.6
Df Residuals:                     789   BIC:                             561.2
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.5051      0.068      7.398      0.0

In [17]:
def quadratic_regression(X_train, X_test, y_train, y_test):
    # introducing quadratic variable
    X_train['X_11'] = X_train['X_1']*X_train['X_1']
    X_test['X_11'] = X_test['X_1']*X_test['X_1']

    model = LinearRegression()
    model.fit(X_train, y_train)

    feat_importance = pd.DataFrame({
        'feat': X_train.columns,
        'importance':model.coef_
    })
    feat_importance['importance'] = feat_importance['importance'].round(2)
    print('R-squared for test:', model.score(X_test, y_test))
    print('Feature importance:', feat_importance)
    

    model_stat = sm.OLS(y_train, sm.add_constant(X_train)).fit()

    print(model_stat.summary())
quadratic_regression(X_train, X_test, y_train, y_test)

R-squared for test: 0.17783285071889143
Feature importance:     feat  importance
0    X_1       -0.08
1    X_2       -0.00
2    X_3        0.08
3    X_4       -0.06
4    X_5        0.03
5    X_6        0.04
6    X_7       -0.10
7    X_8       -0.03
8    X_9        0.02
9   X_10        0.46
10  X_11        0.12
                            OLS Regression Results                            
Dep. Variable:                  label   R-squared:                       0.159
Model:                            OLS   Adj. R-squared:                  0.147
Method:                 Least Squares   F-statistic:                     13.51
Date:                Thu, 27 Feb 2025   Prob (F-statistic):           6.90e-24
Time:                        18:57:37   Log-Likelihood:                -243.53
No. Observations:                 800   AIC:                             511.1
Df Residuals:                     788   BIC:                             567.3
Df Model:                          11                   

In [32]:
from sklearn.preprocessing import LabelEncoder
def one_hot_encoding(X_train, X_test, y_train, y_test,encoding_type='label'):
    random_str = random.choices('ABC', k=len(X_train))
    X_train['X_cat'] = random_str

    random_str = random.choices('ABC', k=len(X_test))
    X_test['X_cat'] = random_str

    if encoding_type=='label':
        encoder = LabelEncoder()
        encoder.fit(X_train['X_cat'])
        X_train_cat = X_train.copy()
        X_test_cat = X_test.copy()
        X_train_cat['X_cat'] = encoder.transform(X_train['X_cat'])
        X_test_cat['X_cat'] = encoder.transform(X_test['X_cat'])
        

    elif encoding_type=='one_hot':
        X_train_cat = pd.get_dummies(X_train,columns=['X_cat'],drop_first=True)
        X_test_cat = pd.get_dummies(X_test,columns=['X_cat'],drop_first=True)
    else:
        print('Adjust Label')
        return
    
    model = LinearRegression()
    model.fit(X_train_cat,y_train)


    feat_importance = pd.DataFrame({
        'feat': X_train_cat.columns,
        'importance':model.coef_
    })
    feat_importance['importance'] = feat_importance['importance'].round(2)
    print('R-squared for test:', model.score(X_test_cat, y_test))
    print('Feature importance:', feat_importance)
one_hot_encoding(X_train, X_test, y_train, y_test)

R-squared for test: 0.17805030687298695
Feature importance:      feat  importance
0     X_1       -0.07
1     X_2       -0.00
2     X_3        0.08
3     X_4       -0.06
4     X_5        0.03
5     X_6        0.04
6     X_7       -0.10
7     X_8       -0.02
8     X_9        0.02
9    X_10        0.46
10   X_11        0.12
11  X_cat        0.01


In [47]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, RidgeCV
def ridge_regression(X_train, X_test, y_train, y_test):
    scaler = StandardScaler()
    column_names = X_train.columns
    scaler.fit(X_train)
    
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    
    model = Ridge()
    model.fit(X_train, y_train)
    
    print("Coefficients:", model.coef_)
    print("Intercept:", model.intercept_)
    print('R-squared:', model.score(X_test, y_test))
    model_predictions = model.predict(X_test)
    print('RMSE: ', math.sqrt(mean_squared_error(y_test,model_predictions)))

    lambdas = np.logspace(-3,3,10)
    model_cv = RidgeCV(alphas=lambdas, cv=5)
    model_cv.fit(X_train, y_train)

    print('Best alpha for RidgeCV is:', model_cv.alpha_)

    model_alpha = Ridge(alpha=model_cv.alpha_)
    model_alpha.fit(X_train, y_train)
    print('R-squared:', model_alpha.score(X_test, y_test))

    feature_imp = pd.DataFrame({
        'feature':X_train.columns,
        'importance':model_alpha.coef_
    })
    print(feature_imp)
ridge_regression(X_train, X_test, y_train, y_test)

Coefficients: [ 0.00163394 -0.00140457 -0.00584777 -0.01820991 -0.0034804   0.00274065
  0.01434933 -0.00868348 -0.00632897  0.13429372]
Intercept: 0.7463991375461476
R-squared: 0.15568745583078591
RMSE:  0.3247540582643224
Best alpha for RidgeCV is: 46.41588833612773
R-squared: 0.15625725146064018


AttributeError: 'numpy.ndarray' object has no attribute 'columns'