In [4]:
#Importing relevant packages
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer

# Reading the raw CSV into the working DataFrame
df = pd.read_csv(filepath_or_buffer="/content/original_data_season2.csv")

# Listing every column whose missing values should be filled by regression
# (one name per line so additions and removals show up cleanly in diffs)
columns_to_impute = [
    "RHR",
    "HRV",
    "Recovery",
    "Sleep.Score",
    "Hours.in.Bed",
    "Hours.of.Sleep",
    "Sleep.Need",
    "Sleep.Efficiency....",
    "Wake.Periods",
    "Sleep.Disturbances",
    "Latency..min.",
    "Cycles",
    "REM.Sleep..hours.",
    "Deep.Sleep..hours.",
    "Light.Sleep..hours.",
    "Awake..hours.",
    "Sleep.Debt..hours.",
    "Sleep.Consistency",
    "Respiratory.Rate",
    "Total.Cycle.Sleep.Time..hours.",
    "REM.Percentage",
    "Deep.Sleep.Percentage",
    "Restorative.Sleep..hours.",
    "Restorative.Sleep....",
    "PPC",
    "MPC",
    "EB",
    "OR",
    "MS",
    "LA",
    "NES",
    "OS",
    "Body.Weight..kg.",
    "Peak.Power.Mean",
    "Peak.Power.SD",
    "Peak.Power.CoV",
    "Peak.Power.BM.Mean",
    "Peak.Power.BM.SD",
    "Peak.Power.BM.CoV",
    "RSI.Mean",
    "RSI.SD",
    "RSI.CoV",
    "Jump.Height.Mean",
    "Jump.Height.SD",
    "Jump.Height.CoV",
    "Week.Trimp.Total",
    "Daily.Average",
    "Weekly.SD",
    "Monotony",
    "Strain",
    "RT.Volume.Load",
    "HR.min..bpm.",
    "HR.avg..bpm.",
    "HR.max..bpm.",
    "HR.min....",
    "HR.avg....",
    "HR.max....",
    "Distance...min..m.min.",
    "Maximum.speed..km.h.",
    "Average.speed..km.h.",
    "Sprints",
    "Distance.in.Speed.zone.1..m...1.00...4.99.km.h.",
    "Distance.in.Speed.zone.2..m...5.00...6.99.km.h.",
    "Distance.in.Speed.zone.3..m...7.00...10.99.km.h.",
    "Distance.in.Speed.zone.4..m...11.00...14.99.km.h.",
    "Distance.in.Speed.zone.5..m...15.00..km.h.",
    "Number.of.accelerations...50.00....3.00.m.s².",
    "Number.of.accelerations...2.99....2.00.m.s².",
    "Number.of.accelerations...1.99....1.00.m.s².",
    "Number.of.accelerations...0.99....0.50.m.s².",
    "Number.of.accelerations..0.50...0.99.m.s².",
    "Number.of.accelerations..1.00...1.99.m.s².",
    "Number.of.accelerations..2.00...2.99.m.s².",
    "Number.of.accelerations..3.00...50.00.m.s².",
    "Calories..kcal.",
    "Training.load.score",
    "Cardio.load",
    "Recovery.time..h.",
]

# Imputing using regression for specified columns.
#
# For each target column, a linear regression is fitted on the rows where the
# target is observed, using every other column of df as a predictor, and its
# predictions are written into the rows where the target is missing.
#
# NOTE: df is updated in place, so columns filled earlier in the list are used
# (with their predicted values) as predictors for the columns that follow.
# NOTE: the predictors are mean-imputed first; the 'mean' strategy only accepts
# numeric data, so every other column of df is expected to be numeric.
for column in columns_to_impute:
    # Computing the null mask once and reusing it for the split and the write-back
    missing_mask = df[column].isnull()

    # Nothing to fill: scikit-learn rejects zero-row input, so calling
    # transform()/predict() on an empty frame would raise a ValueError.
    if not missing_mask.any():
        continue

    # Nothing to learn from: a regression cannot be fitted without targets.
    # The column is left untouched and is reported by the final check.
    if missing_mask.all():
        print(f"Skipping '{column}': no observed values to fit a regression on.")
        continue

    # Separating features and target
    features = df.drop(columns=[column])
    X_complete = features[~missing_mask]
    y_complete = df.loc[~missing_mask, column]
    X_missing = features[missing_mask]

    # Filling gaps in the predictors with the column means learned from the
    # observed rows only, then applying the same means to the rows to predict
    imputer = SimpleImputer(strategy='mean')
    X_complete_imputed = imputer.fit_transform(X_complete)
    X_missing_imputed = imputer.transform(X_missing)

    # Fitting the linear regression model
    model = LinearRegression()
    model.fit(X_complete_imputed, y_complete)

    # Predicting missing values and writing them back into the dataset
    predicted_values = model.predict(X_missing_imputed)
    df.loc[missing_mask, column] = predicted_values

# Writing the imputed dataset to a CSV file (row index omitted)
df.to_csv("perfect_regression_imputation_all_columns.csv", index=False)

# Counting the missing cells left anywhere in the dataset after imputation
missing_values_after_imputation = df.isna().sum().sum()

# Building the status line first so it is printed from a single place
if missing_values_after_imputation:
    status_message = f"Imputation failed: {missing_values_after_imputation} missing values remain."
else:
    status_message = "Imputation successful: No missing values remain."
print(status_message)

Imputation successful: No missing values remain.
