In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import warnings

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

warnings.simplefilter(action='ignore', category=Warning)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/mlx-2-0-regression/sample_submission.csv
/kaggle/input/mlx-2-0-regression/train.csv
/kaggle/input/mlx-2-0-regression/test.csv


# Loading the Data

In [2]:
train_data = pd.read_csv("/kaggle/input/mlx-2-0-regression/train.csv")
test_data = pd.read_csv("/kaggle/input/mlx-2-0-regression/test.csv")

In [None]:
train_data.head()


# Data Cleaning


In [3]:
train_data = train_data.drop_duplicates()

In [4]:
train_data.shape

(61609, 62)

In [None]:
train_data.info()

In [None]:
train_data.describe()

In [None]:
train_data.columns

In [7]:
num_cols = train_data.select_dtypes(include=['float64', 'int64']).columns.tolist()
obj_cols = train_data.select_dtypes(include=['object']).columns.tolist()
num_cols.remove("target")
num_cols.remove("id")

In [6]:
missing_stats = pd.DataFrame({
    'Missing Count': train_data[num_cols].isnull().sum(),
    'Missing Percentage': train_data[num_cols].isnull().mean() * 100
}).sort_values('Missing Percentage', ascending=False)

print(missing_stats[missing_stats['Missing Percentage']>0])


                            Missing Count  Missing Percentage
tempo_volatility                    10417           16.908244
beat_frequency_0                     9731           15.794770
album_name_length                    9594           15.572400
duration_ms_1                        9105           14.778685
harmonic_scale_0                     7684           12.472204
instrumental_density_1               7477           12.136214
rhythmic_cohesion_0                  7183           11.659011
emotional_charge_0                   6104            9.907643
intensity_index_0                    5971            9.691766
groove_efficiency_2                  5746            9.326559
organic_immersion_2                  5635            9.146391
rhythmic_cohesion_2                  5560            9.024655
key_variety                          5405            8.773069
emotional_resonance_2                4876            7.914428
harmonic_scale_2                     4467            7.250564
tonal_mo

In [7]:
missing_stats = pd.DataFrame({
    'Missing Count': train_data[obj_cols].isnull().sum(),
    'Missing Percentage': train_data[obj_cols].isnull().mean() * 100
}).sort_values('Missing Percentage', ascending=False)

print(missing_stats[missing_stats['Missing Percentage']>0])


                       Missing Count  Missing Percentage
creator_collective              8914           14.468665
weekday_of_release              5058            8.209839
composition_label_2             4463            7.244071
track_identifier                4436            7.200247
composition_label_0             2949            4.786638
publication_timestamp           1508            2.447694
composition_label_1             1460            2.369784
lunar_phase                     1001            1.624763
season_of_release                473            0.767745


In [None]:
import matplotlib.pyplot as plt

def plot_numerical_histograms(df, bins=30, figsize=(15, 4)):
    num_cols = df.select_dtypes(include=['float64', 'int64']).columns
    n_cols = 3
    n_rows = (len(num_cols) + n_cols - 1) 

    plt.figure(figsize=(figsize[0], figsize[1]*n_rows))
    for i, col in enumerate(num_cols, 1):
        plt.subplot(n_rows, n_cols, i)
        df[col].hist(bins=bins)
        plt.title(col)
        plt.xlabel(col)
        plt.ylabel('Frequency')
    plt.tight_layout()
    plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

def plot_boxplots(df, columns, num_per_row=4):
    sns.set(style="whitegrid")
    num_cols = len(columns)
    num_rows = (num_cols + num_per_row - 1) 
    plt.figure(figsize=(5 * num_per_row, 4 * num_rows))
    
    for i, col in enumerate(columns, 1):
        plt.subplot(num_rows, num_per_row, i)
        sns.boxplot(x=df[col], color="skyblue")
        plt.title(col)
    
    plt.tight_layout()
    plt.show()


In [None]:
plot_numerical_histograms(train_data)

In [None]:
plot_boxplots(train_data,num_cols)

# Handling Missing Values

In [11]:
median_impute_cols = [
    'emotional_charge_2',  
    'groove_efficiency_0',  
    'groove_efficiency_1',  
    'groove_efficiency_2',  
    'beat_frequency_0',  
    'beat_frequency_1',
    'beat_frequency_2',  
    'duration_ms_0',  
    'duration_ms_1',  
    'duration_ms_2',  
    'organic_immersion_0',  
    'organic_immersion_1',  
    'organic_immersion_2', 
    'organic_texture_0',  
    'organic_texture_1',  
    'performance_authenticity_0',  
    'performance_authenticity_1',  
    'performance_authenticity_2', 
    'instrumental_density_0', 
    'instrumental_density_1',  
    'instrumental_density_2',  
    'duration_consistency',  
    'tempo_volatility',  
    'vocal_presence_0',  
    'vocal_presence_1',  
    'vocal_presence_2',  
    'intensity_index_2'  
]

train_data[median_impute_cols] = train_data[median_impute_cols].fillna(train_data[median_impute_cols].median())
test_data[median_impute_cols] = test_data[median_impute_cols].fillna(train_data[median_impute_cols].median())

In [12]:
mean_impute_cols = [
    'emotional_charge_0', 
    'emotional_charge_1',  
    'emotional_resonance_0',  
    'emotional_resonance_1',  
    'emotional_resonance_2', 
    'rhythmic_cohesion_0',  
    'rhythmic_cohesion_1',  
    'rhythmic_cohesion_2',  
    'intensity_index_0',  
    'intensity_index_1',  
]
train_data[mean_impute_cols] = train_data[mean_impute_cols].fillna(train_data[mean_impute_cols].mean())
test_data[mean_impute_cols] = test_data[mean_impute_cols].fillna(train_data[mean_impute_cols].mean())

In [13]:
mode_impute_cols = [
    'harmonic_scale_0', 
    'harmonic_scale_1', 
    'harmonic_scale_2',  
    'tonal_mode_0',  
    'tonal_mode_1',  
    'tonal_mode_2',  
    'time_signature_0',  
    'time_signature_1', 
    'time_signature_2',  
    'album_component_count',  
    'album_name_length',  
    'artist_count',  
    'key_variety',  
    'organic_texture_2',  
]

train_data[mode_impute_cols] = train_data[mode_impute_cols].fillna(train_data[mode_impute_cols].mean())
test_data[mode_impute_cols] = test_data[mode_impute_cols].fillna(train_data[mode_impute_cols].mean())

In [14]:
cat_cols_unknown = ['creator_collective']

cat_cols_mode = ['weekday_of_release','composition_label_2','track_identifier',
    'composition_label_0', 'publication_timestamp',
    'composition_label_1', 'lunar_phase', 'season_of_release'
]

train_data[cat_cols_unknown] = train_data[cat_cols_unknown].fillna("Unknown")
test_data[cat_cols_unknown] = test_data[cat_cols_unknown].fillna("Unknown")

for col in cat_cols_mode:
    mode_val = train_data[col].mode()[0]
    train_data[col] = train_data[col].fillna(mode_val)
    test_data[col] = test_data[col].fillna(mode_val)

In [None]:
train_data.isnull().sum()

In [None]:
missing_stats = pd.DataFrame({
    'Missing Count': train_data[num_cols].isnull().sum(),
    'Missing Percentage': train_data[num_cols].isnull().mean() * 100
}).sort_values('Missing Percentage', ascending=False)

print(missing_stats[missing_stats['Missing Percentage']>0])


In [None]:
missing_stats = pd.DataFrame({
    'Missing Count': train_data[obj_cols].isnull().sum(),
    'Missing Percentage': train_data[obj_cols].isnull().mean() * 100
}).sort_values('Missing Percentage', ascending=False)

print(missing_stats[missing_stats['Missing Percentage']>0])


# Encoding Categorical Features

In [None]:
data_desc = pd.DataFrame(index=train_data[num_cols].columns.to_list()) 
data_desc['type'] = train_data[num_cols].dtypes
data_desc['count'] = train_data[num_cols].count()
data_desc['unique'] = train_data[num_cols].nunique()
data_desc['null'] = train_data[num_cols].isnull().sum()

data_desc

In [None]:
data_desc = pd.DataFrame(index=train_data[obj_cols].columns.to_list()) 
data_desc['type'] = train_data[obj_cols].dtypes
data_desc['count'] = train_data[obj_cols].count()
data_desc['unique'] = train_data[obj_cols].nunique()
data_desc['null'] = train_data[obj_cols].isnull().sum()

data_desc

In [15]:
from sklearn.preprocessing import OneHotEncoder

def one_hot_encode_and_add(df,column):
    one_hot_encoder = OneHotEncoder(sparse_output=False)
    one_hot_encoded = one_hot_encoder.fit_transform(df[[column]])
    encoded_columns = pd.DataFrame(one_hot_encoded,columns=one_hot_encoder.get_feature_names_out([column]))
    encoded_columns.index = df.index
    df = pd.concat([df,encoded_columns],axis=1)
    df = df.drop(columns=[column])
    return df


columns_to_encode = ['weekday_of_release', 'season_of_release', 'lunar_phase']


for col in columns_to_encode:
    train_data = one_hot_encode_and_add(train_data,col)


for col in columns_to_encode:
    test_data = one_hot_encode_and_add(test_data,col)


train_data

Unnamed: 0,id,emotional_charge_2,groove_efficiency_1,beat_frequency_1,organic_texture_2,composition_label_0,harmonic_scale_1,intensity_index_0,duration_ms_0,album_name_length,...,weekday_of_release_Tuesday,weekday_of_release_Wednesday,season_of_release_autumn,season_of_release_spring,season_of_release_summer,season_of_release_winter,lunar_phase_full,lunar_phase_new,lunar_phase_waning,lunar_phase_waxing
0,76339,0.482850,1.169231,80.018,0.020100,Country Stuff (feat. Jake Owen),1.000000,0.789000,154586.0,18.225723,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,80006,0.267862,1.321321,147.966,0.334000,Solitude,6.000000,0.715000,46874.0,15.000000,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,83501,0.242606,1.285319,142.980,0.111000,BDFFRNT (Saved from Conformity),4.000000,0.604426,264665.0,7.000000,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,81530,0.426400,1.279435,123.063,0.196000,Headlights (feat. Ilsey),5.000000,0.685000,209208.0,5.000000,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,60534,0.000000,0.974906,132.722,0.081100,Afraid,6.000000,0.856000,215346.0,5.000000,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61604,75677,0.261387,0.809917,90.017,0.510000,Hasret,5.192594,0.449000,218490.0,43.000000,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
61605,29591,0.585648,1.185022,122.973,0.512000,NO HALO,5.192594,0.573000,259746.0,6.000000,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
61606,62355,0.112203,0.469714,121.045,0.493000,Aloha! - Main Title Theme,10.000000,0.526000,79093.0,66.000000,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
61607,88778,0.438699,1.486339,95.390,0.023700,Golden,5.192594,0.838000,208906.0,9.000000,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [None]:
train_data

In [None]:
# drop_cols = [
#     'composition_label_0', 'composition_label_1', 'composition_label_2',
#     'track_identifier']
# train_data = train_data.drop(columns=drop_cols)
# test_data = test_data.drop(columns=drop_cols)

In [16]:
from sklearn.model_selection import KFold

def target_encode(train, test, col, target, n_splits=5, smooth=10):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    global_mean = train[target].mean()
    train[col + '_te'] = 0

    for train_idx, val_idx in kf.split(train):
        fold_train, fold_val = train.iloc[train_idx], train.iloc[val_idx]
        means = fold_train.groupby(col)[target].agg(['mean', 'count'])
        means['smooth'] = (means['mean'] * means['count'] + global_mean * smooth) / (means['count'] + smooth)
        train.loc[train.index[val_idx], col + '_te'] = fold_val[col].map(means['smooth']).fillna(global_mean)

    means = train.groupby(col)[target].agg(['mean', 'count'])
    means['smooth'] = (means['mean'] * means['count'] + global_mean * smooth) / (means['count'] + smooth)
    test[col + '_te'] = test[col].map(means['smooth']).fillna(global_mean)

    train.drop(columns=[col], inplace=True)
    test.drop(columns=[col], inplace=True)

    return train, test



In [17]:
train_data, test_data = target_encode(train_data, test_data,'composition_label_0','target')
train_data, test_data = target_encode(train_data, test_data,'composition_label_1','target')
train_data, test_data = target_encode(train_data, test_data,'composition_label_2','target')
train_data, test_data = target_encode(train_data, test_data,'creator_collective','target')
train_data, test_data = target_encode(train_data, test_data,'track_identifier','target')


In [18]:
train_data['publication_timestamp'] = pd.to_datetime(train_data['publication_timestamp'], errors='coerce')

train_data['release_year'] = train_data['publication_timestamp'].dt.year
train_data['release_month'] = train_data['publication_timestamp'].dt.month

train_data.drop('publication_timestamp', axis=1, inplace=True)

test_data['publication_timestamp'] = pd.to_datetime(test_data['publication_timestamp'], errors='coerce')

test_data['release_year'] = test_data['publication_timestamp'].dt.year
test_data['release_month'] = test_data['publication_timestamp'].dt.month

test_data.drop('publication_timestamp', axis=1, inplace=True)


# Feature Engineering

In [20]:
def add_engineered_features(df):
    df['mean_emotional_charge'] = df[['emotional_charge_0', 'emotional_charge_1', 'emotional_charge_2']].mean(axis=1)
    df['var_beat_frequency'] = df[['beat_frequency_0', 'beat_frequency_1', 'beat_frequency_2']].var(axis=1)
    df['max_organic_immersion'] = df[['organic_immersion_0', 'organic_immersion_1', 'organic_immersion_2']].max(axis=1)
    df['mean_groove_efficiency'] = df[['groove_efficiency_0', 'groove_efficiency_1', 'groove_efficiency_2']].mean(axis=1)
    
    df['duration_ratio_0_1'] = df['duration_ms_0'] / (df['duration_ms_1'] + 1e-6)
    df['intensity_ratio_0_2'] = df['intensity_index_0'] / (df['intensity_index_2'] + 1e-6)
    df['emotional_charge_ratio_1_2'] = df['emotional_charge_1'] / (df['emotional_charge_2'] + 1e-6)
    
    df['is_holiday_season'] = df['release_month'].isin([11, 12]).astype(int)
    df['release_quarter'] = ((df['release_month'] - 1) // 3 + 1).astype(int)
    
    df['emotional_charge_1_artist_count'] = df['emotional_charge_1'] * df.get('artist_count', 1)
    df['groove_efficiency_0_album_components'] = df['groove_efficiency_0'] * df['album_component_count']
    df['tempo_volatility_artist_count'] = df['tempo_volatility'] * df.get('artist_count', 1)
    
    df['log_duration_ms_0'] = np.log1p(df['duration_ms_0'])
    df['sqrt_tempo_volatility'] = np.sqrt(df['tempo_volatility'].clip(lower=0))
    
    df['emotional_impact'] = df[['emotional_charge_0', 'emotional_charge_1', 'emotional_charge_2',
                                'emotional_resonance_0', 'emotional_resonance_1', 'emotional_resonance_2']].mean(axis=1)
    df['rhythmic_quality'] = df[['groove_efficiency_0', 'groove_efficiency_1', 'groove_efficiency_2',
                                 'rhythmic_cohesion_1']].mean(axis=1)
    df['intensity_score'] = df[['intensity_index_0', 'intensity_index_1', 'intensity_index_2',
                               'beat_frequency_0', 'beat_frequency_1', 'beat_frequency_2']].mean(axis=1)
    
    return df

In [31]:
y = train_data['target']
X_init = train_data[['emotional_charge_2', 'groove_efficiency_1', 'beat_frequency_1',
       'organic_texture_2', 'harmonic_scale_1', 'intensity_index_0',
       'duration_ms_0', 'album_name_length', 'beat_frequency_0',
       'beat_frequency_2', 'artist_count', 'album_component_count',
       'emotional_charge_1', 'emotional_charge_0', 'tonal_mode_2',
       'key_variety', 'performance_authenticity_2',
       'performance_authenticity_0', 'time_signature_1', 'duration_ms_2',
       'instrumental_density_2', 'organic_texture_0', 'vocal_presence_2',
       'tonal_mode_1', 'vocal_presence_1', 'vocal_presence_0',
       'intensity_index_1', 'organic_immersion_0', 'tonal_mode_0',
       'groove_efficiency_2', 'instrumental_density_1', 'organic_immersion_2',
       'duration_consistency', 'organic_texture_1', 'rhythmic_cohesion_0',
       'emotional_resonance_1', 'rhythmic_cohesion_1',
       'performance_authenticity_1', 'tempo_volatility', 'organic_immersion_1',
       'groove_efficiency_0', 'emotional_resonance_2', 'time_signature_0',
       'duration_ms_1', 'harmonic_scale_0', 'time_signature_2',
       'rhythmic_cohesion_2', 'emotional_resonance_0', 'harmonic_scale_2',
       'intensity_index_2', 'instrumental_density_0',
       'weekday_of_release_Friday', 'weekday_of_release_Monday',
       'weekday_of_release_Saturday', 'weekday_of_release_Sunday',
       'weekday_of_release_Thursday', 'weekday_of_release_Tuesday',
       'weekday_of_release_Wednesday', 'season_of_release_autumn',
       'season_of_release_spring', 'season_of_release_summer',
       'season_of_release_winter', 'lunar_phase_full', 'lunar_phase_new',
       'lunar_phase_waning', 'lunar_phase_waxing', 'composition_label_0_te',
       'composition_label_1_te', 'composition_label_2_te',
       'creator_collective_te', 'track_identifier_te', 'release_year',
       'release_month']]

In [32]:
X_init = add_engineered_features(X_init.copy())
test_data = add_engineered_features(test_data.copy())

In [23]:
import pandas as pd
import numpy as np

def find_highly_correlated_features(df, threshold=0.95):
    corr_matrix = df.corr().abs()
    upper = corr_matrix.where(
        np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
    )

    to_drop = {
        column
        for column in upper.columns
        if any(upper[column] > threshold)
    }

    return to_drop


In [24]:
redundant_features = find_highly_correlated_features(train_data, threshold=0.95)

print("Highly correlated features to drop:")
print(redundant_features)

Highly correlated features to drop:
set()


In [None]:
from sklearn.feature_selection import mutual_info_regression

def make_mi_scores(X,y):
    X = X.copy()
    X = X.dropna()
    discrete_features = [pd.api.types.is_integer_dtype(t) for t in X.dtypes]
    mi_scores = mutual_info_regression(X,y,discrete_features=discrete_features,random_state=0)
    mi_scores = pd.Series(mi_scores,name="MI Scores",index=X.columns)
    mi_scores = mi_scores.sort_values(ascending=False)
    return mi_scores




mi_scores = make_mi_scores(X,y)

mi_scores

In [None]:
mi_scores_df = mi_scores.reset_index()
print(mi_scores_df.to_string(index=False))

In [None]:
import matplotlib.pyplot as plt

def plot_mi_scores(mi_scores):
    plt.figure(figsize=(12, 6))
    mi_scores.plot(kind='bar')
    plt.ylabel("Mutual Information Score")
    plt.title("Feature Importance (MI Scores)")
    plt.xticks(rotation=90)
    plt.tight_layout()
    plt.show()


In [None]:
plot_mi_scores(mi_scores)

In [33]:
selected_features = [
    'composition_label_1_te', 'track_identifier_te', 'composition_label_2_te',
    'composition_label_0_te', 'organic_immersion_1', 'groove_efficiency_1',
    'beat_frequency_1', 'groove_efficiency_0', 'duration_ms_2',
    'duration_ms_0', 'log_duration_ms_0', 'emotional_charge_2',
    'duration_consistency', 'beat_frequency_2', 'organic_immersion_0',
    'emotional_charge_1', 'max_organic_immersion',
    'groove_efficiency_0_album_components', 'organic_immersion_2',
    'intensity_ratio_0_2', 'mean_groove_efficiency', 'groove_efficiency_2',
    'emotional_charge_ratio_1_2', 'emotional_charge_0', 'rhythmic_quality',
    'duration_ratio_0_1', 'emotional_charge_1_artist_count',
    'var_beat_frequency', 'duration_ms_1', 'mean_emotional_charge',
    'tempo_volatility', 'sqrt_tempo_volatility', 'beat_frequency_0',
    'intensity_score', 'organic_texture_2', 'tempo_volatility_artist_count',
    'organic_texture_1', 'organic_texture_0', 'emotional_impact',
    'creator_collective_te', 'vocal_presence_0', 'emotional_resonance_0',
    'vocal_presence_1', 'emotional_resonance_1', 'vocal_presence_2',
    'intensity_index_2', 'emotional_resonance_2', 'intensity_index_1',
    'performance_authenticity_0', 'rhythmic_cohesion_1', 'intensity_index_0',
    'instrumental_density_0', 'performance_authenticity_1',
    'performance_authenticity_2', 'rhythmic_cohesion_2',
    'rhythmic_cohesion_0', 'instrumental_density_2', 'instrumental_density_1'
]




In [13]:
len(selected_features)

58

In [34]:
top_features=['composition_label_1_te', 'track_identifier_te', 'composition_label_2_te',
    'composition_label_0_te', 'organic_immersion_1', 'groove_efficiency_1',
    'beat_frequency_1', 'groove_efficiency_0', 'duration_ms_2',
    'duration_ms_0', 'log_duration_ms_0', 'emotional_charge_2',
    'duration_consistency', 'beat_frequency_2', 'organic_immersion_0',
    'emotional_charge_1', 'max_organic_immersion',
    'groove_efficiency_0_album_components', 'organic_immersion_2',
    'intensity_ratio_0_2','mean_groove_efficiency', 'groove_efficiency_2',
    'emotional_charge_ratio_1_2', 'emotional_charge_0', 'rhythmic_quality',
    'duration_ratio_0_1', 'emotional_charge_1_artist_count',
    'var_beat_frequency', 'duration_ms_1', 'mean_emotional_charge',
    'tempo_volatility', 'sqrt_tempo_volatility', 'beat_frequency_0',
    'intensity_score', 'organic_texture_2', 'tempo_volatility_artist_count',
    'organic_texture_1', 'organic_texture_0', 'emotional_impact',
    'creator_collective_te','vocal_presence_0', 'emotional_resonance_0',
    'vocal_presence_1', 'emotional_resonance_1', 'vocal_presence_2',
    'intensity_index_2', 'emotional_resonance_2', 'intensity_index_1',
    'performance_authenticity_0', 'rhythmic_cohesion_1','intensity_index_0',
    'instrumental_density_0', 'performance_authenticity_1',
    'performance_authenticity_2', 'rhythmic_cohesion_2',
    'rhythmic_cohesion_0', 'instrumental_density_2', 'instrumental_density_1']

In [11]:
len(top_features)

58

In [35]:
X = X_init[top_features]
test_df = test_data[top_features]

# Standardizing Features

In [36]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [37]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
test_data_scaled = scaler.transform(test_df)

# Model Training and Testing

In [38]:
from sklearn.model_selection import train_test_split
train_X, val_X, train_y, val_y = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [39]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.feature_selection import mutual_info_regression

In [40]:
ridge = Ridge()
param_grid = {'alpha': np.logspace(-3, 3, 20)}
grid_search = GridSearchCV(ridge, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(train_X, train_y)
best_ridge = grid_search.best_estimator_
print(f"Best alpha: {grid_search.best_params_['alpha']}")

Best alpha: 483.2930238571752


In [41]:
y_pred = best_ridge.predict(val_X)
rmse = np.sqrt(mean_squared_error(val_y, y_pred))
mae = mean_absolute_error(val_y, y_pred)
r2 = r2_score(val_y, y_pred)
n = val_X.shape[0]
p = val_X.shape[1]
adjusted_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)

print(f"Validation RMSE: {rmse:.4f}")
print(f"Validation MAE: {mae:.4f}")
print(f"Validation R²: {r2:.4f}")
print(f"Validation Adjusted R²: {adjusted_r2:.4f}")

Validation RMSE: 13.8438
Validation MAE: 10.6667
Validation R²: 0.5885
Validation Adjusted R²: 0.5866


# Final Model Selection

In [None]:
final_model = Ridge(alpha=grid_search.best_params_['alpha'])
final_model.fit(X_scaled, y) 

# Predictions

In [None]:
test_predictions = final_model.predict(test_data_scaled)


# Submission

In [None]:
submission = pd.read_csv('/kaggle/input/mlx-2-0-regression/sample_submission.csv')
submission

In [None]:
submission['target'] = test_predictions
submission

In [None]:
submission.to_csv("submission_ridge.csv",index=False)