In [1]:
!pip install termcolor
!pip install pandas 
!pip install tqdm

[0m

### Utils 

In [2]:
# For Pyspark with No none
from pyspark.sql.functions import col
from pyspark.sql.types import FloatType, IntegerType
from termcolor import colored
from tqdm import tqdm
import numpy as np
import pandas as pd
from IPython.display import display

def ReduceMemory(df: pd.DataFrame):
    """
    This function reduces the associated dataframe's memory usage.
    It reassigns the data-types of columns according to their min-max values.
    It also displays the dataframe information after memory reduction.
    """;
    
    # Reducing float column memory usage:-
    for col in tqdm(df.iloc[0:2, 1:].select_dtypes('float').columns):
        col_min = np.amin(df[col].dropna());
        col_max = np.amax(df[col].dropna());
        
        if col_min >= np.finfo(np.float16).min and col_max <= np.finfo(np.float16).max: 
            df[col] = df[col].astype(np.float16)
        elif col_min >= np.finfo(np.float32).min and col_max <= np.finfo(np.float32).max : 
            df[col] = df[col].astype(np.float32)
        else: pass;

    # Reducing integer column memory usage:-
    for col in tqdm(df.iloc[0:2, 1:].select_dtypes('int').columns):
        col_min = df[col].min(); 
        col_max = df[col].max();
        
        if col_min >= np.iinfo(np.int8).min and col_max <= np.iinfo(np.int8).max:
            df[col] = df[col].astype(np.int8);
        elif col_min >= np.iinfo(np.int16).min and col_max <= np.iinfo(np.int16).max:
            df[col] = df[col].astype(np.int16);
        elif col_min >= np.iinfo(np.int32).min & col_max <= np.iinfo(np.int32).max:
            df[col] = df[col].astype(np.int32);
        else: pass;
        
    print(colored(f"\nDataframe information after memory reduction\n", 
                  color = 'blue', attrs= ['bold']));
    display(df.info()); 
    
    return df;
    

### Model

In [3]:
!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu

Looking in indexes: https://download.pytorch.org/whl/cpu
[0m

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math # For positional encoding in Transformer

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class MLPRegressor(nn.Module):
    def __init__(self, input_size, hidden_sizes, output_size):
        super(MLPRegressor, self).__init__()
        layers = []
        for i in range(len(hidden_sizes)):
            if i == 0:
                layers.append(nn.Linear(input_size, hidden_sizes[i]))
            else:
                layers.append(nn.Linear(hidden_sizes[i-1], hidden_sizes[i]))
            layers.append(nn.ReLU())

        self.layers = nn.Sequential(*layers)
        self.output_layer = nn.Linear(hidden_sizes[-1], output_size)

    def forward(self, x):
        x = self.layers(x)
        x = self.output_layer(x)
        return x

    
class ResidualBlock(nn.Module):
    def __init__(self, input_size, output_size):
        super(ResidualBlock, self).__init__()
        self.fc = nn.Linear(input_size, output_size)
        self.activation = nn.ReLU()

        # Adding a linear transformation for the residual link if input and output sizes differ
        if input_size != output_size:
            self.shortcut = nn.Linear(input_size, output_size)
        else:
            self.shortcut = nn.Identity()

    def forward(self, x):
        identity = self.shortcut(x)
        out = self.fc(x)
        out = self.activation(out + identity)
        return out

class MLPResidualRegressor(nn.Module):
    def __init__(self, input_size, hidden_sizes, output_size):
        super(MLPResidualRegressor, self).__init__()

        layers = []
        current_size = input_size
        for hidden_size in hidden_sizes:
            layers.append(ResidualBlock(current_size, hidden_size))
            current_size = hidden_size

        self.layers = nn.Sequential(*layers)
        self.output_layer = nn.Linear(hidden_sizes[-1], output_size)

    def forward(self, x):
        x = self.layers(x)
        x = self.output_layer(x)
        return x



## Pipeline

In [5]:
from pyspark.ml.feature import (
    Imputer,
    StandardScaler,
    StringIndexer,
    OneHotEncoder,
    VectorAssembler,
)
from pyspark.ml import Pipeline, Transformer
import os
from pyspark.sql.dataframe import DataFrame
from pyspark.sql.functions import lit, monotonically_increasing_id
from pyspark import SparkContext, SQLContext
from pyspark.sql import SparkSession
import pyspark
import numpy as np
from tqdm import tqdm
import pandas as pd
import torch
import torch.nn as nn
import gc

In [6]:
# Config 
SEED = 3407
IS_SPARKML = True



def seed_everything(seed: int):
    import random, os
    import numpy as np
    import torch
    
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True
    
seed_everything(seed=SEED)

In [7]:
spark = SparkSession.builder.appName("Task3").getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/16 18:49:38 INFO SparkEnv: Registering MapOutputTracker
23/11/16 18:49:38 INFO SparkEnv: Registering BlockManagerMaster
23/11/16 18:49:38 INFO SparkEnv: Registering BlockManagerMasterHeartbeat
23/11/16 18:49:38 INFO SparkEnv: Registering OutputCommitCoordinator


In [8]:
# define some config
continuous_cols = [
    "potential",
    "value_eur",
    "wage_eur",
    "age",
    "height_cm",
    "weight_kg",
    "club_team_id",
    "league_level",
    "nationality_id",
    "weak_foot",
    "skill_moves",
    "international_reputation",
    "pace",
    "shooting",
    "passing",
    "dribbling",
    "defending",
    "physic",
    "attacking_crossing",
    "attacking_finishing",
    "attacking_heading_accuracy",
    "attacking_short_passing",
    "attacking_volleys",
    "skill_dribbling",
    "skill_curve",
    "skill_fk_accuracy",
    "skill_long_passing",
    "skill_ball_control",
    "movement_acceleration",
    "movement_sprint_speed",
    "movement_agility",
    "movement_reactions",
    "movement_balance",
    "power_shot_power",
    "power_jumping",
    "power_stamina",
    "power_strength",
    "power_long_shots",
    "mentality_aggression",
    "mentality_interceptions",
    "mentality_positioning",
    "mentality_vision",
    "mentality_penalties",
    "defending_marking_awareness",
    "defending_standing_tackle",
    "defending_sliding_tackle",
    "goalkeeping_diving",
    "goalkeeping_handling",
    "goalkeeping_kicking",
    "goalkeeping_positioning",
    "goalkeeping_reflexes",
    "ls",
    "st",
    "rs",
    "lw",
    "lf",
    "cf",
    "rf",
    "rw",
    "lam",
    "cam",
    "ram",
    "lm",
    "lcm",
    "cm",
    "rcm",
    "rm",
    "lwb",
    "ldm",
    "cdm",
    "rdm",
    "rwb",
    "lb",
    "lcb",
    "cb",
    "rcb",
    "rb",
    "gk",
    "year"
]
position_binary_cols = [
    "Position_CF",
    "Position_LW",
    "Position_LM",
    "Position_RM",
    "Position_RW",
    "Position_ST",
    "Position_GK",
    "Position_CM",
    "Position_CDM",
    "Position_RB",
    "Position_CB",
    "Position_CAM",
    "Position_LB",
    "Position_RWB",
    "Position_LWB",
]
nominal_cols = ["league_name", "club_position", "work_rate"]
# nominal_cols = ["club_position", "work_rate"]
ordinal_cols = ["preferred_foot"]

cols_to_drop = [
    'id',
    'long_name',
    "player_url",
    "player_face_url",
    "club_logo_url",
    "club_flag_url",
    "nation_logo_url",
    "nation_flag_url",
    "sofifa_id",
    "short_name",
    "dob",
    "club_name",
    "club_jersey_number",
    "club_loaned_from",
    "nationality_name",
    "nation_jersey_number",
    "body_type",
    "real_face",
    "goalkeeping_speed",
    "club_contract_valid_until",
    "nation_team_id",
    "nation_position",
    "player_tags",
    "player_traits",
    "release_clause_eur",
    "long_name",
]


In [9]:
# Create a class to process data
# import spark ml related libraries
class OutcomeCreater(Transformer):
    def __init__(self):
        super().__init__()
        
    def _transform(self, df):
        df = df.withColumnRenamed(
            "overall", "outcome"
        )  # rename the overall column to outcome
        return df


class ColumnDropper(Transformer):
    def __init__(self, cols_to_drop=None):
        super().__init__()
        self.cols_to_drop = cols_to_drop

    def _transform(self, df):
        return df.drop(*self.cols_to_drop)


class DataPreprocess1(Transformer):
    """for columns like ls, st..., gk
    columns that contains + or - as string
    """

    def __init__(self, cols_to_preprocess) -> None:
        super().__init__()
        self.cols_to_preprocess = cols_to_preprocess

    def _transform(self, df):
        from pyspark.sql.functions import split
        from pyspark.sql.types import IntegerType

        for col in self.cols_to_preprocess:
            df = df.withColumn(col, split(
                df[col], r'\+|-').getItem(0).cast(IntegerType()))
        return df


class DataPreprocess2(Transformer):
    """
    Transforme the columns in Positions to binary columns
    """

    def __init__(self, cols_to_preprocess) -> None:
        super().__init__()
        self.cols_to_preprocess = cols_to_preprocess

    def _transform(self, dataset: DataFrame) -> DataFrame:
        from pyspark.sql.functions import split, when, col, array_contains
        import itertools
        for column in self.cols_to_preprocess:
            split_positions = split(dataset[column], ', ')
            self.distinct_positions = list(set(list(itertools.chain(
                *dataset.select(split_positions.alias('positions')).distinct().rdd.flatMap(lambda x: x).collect()))))
            print(self.distinct_positions)
            for position in tqdm(self.distinct_positions):
                dataset = dataset.withColumn(
                    'Position_' + position,
                    when(array_contains(split_positions, position), 1).otherwise(0)
                )
            
        dataset = dataset.drop(*self.cols_to_preprocess)
        return dataset
    
class MissingValueModeImputer(Transformer):
    def __init__(self, cols_to_impute=None):
        super().__init__()
        self.cols_to_impute = cols_to_impute

    def _transform(self, df):
        if not self.cols_to_impute:
            return df
        for column_name in self.cols_to_impute:
            df = self._fill_mode(df, column_name)
        return df
    def _fill_mode(self, df, col_name):
        # Calculate the mode 
        mode = df.groupBy(col_name).count().orderBy('count', ascending=False).first()[0]
        return df.na.fill({col_name: mode})

def get_preprocess_pipeline():

    # Stage for columns to preprocess2
    stage_column_pre1 = DataPreprocess1(["ls",
                                        "st",
                                        "rs",
                                        "lw",
                                        "lf",
                                        "cf",
                                        "rf",
                                        "rw",
                                        "lam",
                                        "cam",
                                        "ram",
                                        "lm",
                                        "lcm",
                                        "cm",
                                        "rcm",
                                        "rm",
                                        "lwb",
                                        "ldm",
                                        "cdm",
                                        "rdm",
                                        "rwb",
                                        "lb",
                                        "lcb",
                                        "cb",
                                        "rcb",
                                        "rb",
                                        "gk",])
    
    # Stage for columns to preprocess2
    stage_column_pre2 = DataPreprocess2(["player_positions"])
    
    # Stage where nominal columns are handled by imputer
    cols_to_impute_nominal = ["league_name", "club_position"]
    stage_missing_handler = MissingValueModeImputer(cols_to_impute=cols_to_impute_nominal)
    
    # find all cols that contains missing values in continuous_cols
    cols_to_imputer_numerical = []
    for col in continuous_cols:
        if data.select(col).filter(data[col].isNull()).count() > 0:
            cols_to_imputer_numerical.append(col)
    print(cols_to_imputer_numerical)
    
    from pyspark.ml.feature import Imputer
    stage_missing_handler2 = Imputer(strategy='mean', inputCols=cols_to_imputer_numerical, outputCols=cols_to_imputer_numerical)
    
    # Stage where nominal columns are transformed  to index columns using StringIndexer
    nominal_id_cols = [x+"_index" for x in nominal_cols]
    nominal_onehot_cols = [x+"_onehot" for x in nominal_cols]
    stage_nominal_indexer = StringIndexer(
        inputCols=nominal_cols, outputCols=nominal_id_cols)
    

    
    
    # Stage where nominal columns are transformed to onehot columns using OneHotEncoder
    stage_nominal_onehot = OneHotEncoder(
        inputCols=nominal_id_cols, outputCols=nominal_onehot_cols)

    # Stage where ordinal columns are transformed to index columns using StringIndexer
    ordinal_id_cols = [x+"_index" for x in ordinal_cols]
    stage_ordinal_indexer = StringIndexer(
        inputCols=ordinal_cols, outputCols=ordinal_id_cols)

    feature_cols = continuous_cols + position_binary_cols + ordinal_id_cols + nominal_onehot_cols
    #feature_cols =  continuous_cols + ordinal_id_cols + nominal_onehot_cols

    # Stage where all the features are assembled into a single vector
    stage_vector_assembler = VectorAssembler(
        inputCols=feature_cols, outputCol="vectorized_features")

    # Stage where we scale the columns
    stage_scaler = StandardScaler(
        inputCol='vectorized_features', outputCol='features')

    # Stage for creating the outcome column representing whether there is attack
    stage_outcome = OutcomeCreater()
    
    # Stage for columns dropping
    stage_column_dropper = ColumnDropper(cols_to_drop=cols_to_drop + feature_cols + ordinal_cols + nominal_cols + nominal_id_cols + ['vectorized_features'])
    
    # Connect the columns into a pipeline
    pipeline = Pipeline(stages=[stage_column_pre1,
                                stage_column_pre2,
                                stage_missing_handler,
                                stage_missing_handler2,
                                stage_nominal_indexer,
                                stage_nominal_onehot,
                                stage_ordinal_indexer,
                                stage_vector_assembler,
                                stage_scaler,
                                stage_outcome,
                                stage_column_dropper])
    
    return pipeline

In [10]:
# train utils
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import pandas as pd 
from pyspark.sql.functions import col
from tqdm import tqdm
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def train_model(model, train_loader, val_loader, epochs, learning_rate):
    model = model.to(device)
    criterion = nn.MSELoss()  # Mean Squared Error Loss for regression
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch in tqdm(train_loader):
            inputs, targets = batch
            inputs = inputs.to(device)
            targets = targets.reshape(-1,1).to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        avg_loss = total_loss / len(train_loader)
        val_loss = validate_model(model, val_loader, criterion)
        print(f'Epoch {epoch+1}/{epochs}, Train Loss: {avg_loss:.4f}, Validation Loss: {val_loss:.4f}')


    return model

def validate_model(model, val_loader, criterion):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            inputs, targets = batch
            inputs = inputs.to(device)
            targets = targets.reshape(-1,1).to(device)
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            total_loss += loss.item()

    avg_loss = total_loss / len(val_loader)
    return avg_loss

def data_loader(train, val, batch_size=2048):
    # Load your dataset here and preprocess
    train_df = train.select('outcome', 'features').toPandas()
    val_df = val.select('outcome', 'features').toPandas()
    
    X_train = np.array(train_df['features'].apply(lambda x: x.toArray()).tolist())
    y_train = np.array(train_df['outcome'].values)
    X_val = np.array(val_df['features'].apply(lambda x: x.toArray()).tolist())
    y_val = np.array(val_df['outcome'].values)

    train_dataset = TensorDataset(torch.tensor(X_train, dtype=torch.float32), torch.tensor(y_train, dtype=torch.float32))
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    val_dataset = TensorDataset(torch.tensor(X_val, dtype=torch.float32), torch.tensor(y_val, dtype=torch.float32))
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    return train_loader, val_loader

In [11]:
def load_data(hdfs_data_path):
    # merge csvs and read data
# fifa data folder should contain all the csv files from Fifa(Kaggle), 2015-2022
# assume that you are working in the same directory as the data folder

    # Read CSV files from HDFS
    csv_files = spark.sparkContext.wholeTextFiles(hdfs_data_path + "/*.csv").keys().collect()
    print(csv_files)

    combined_df = None

    for file in tqdm(csv_files):
        year = file.split("players_")[1].split(".csv")[0]
        df = spark.read.csv(file, header=True, inferSchema=True)
        df = df.withColumn("year", lit(int(year))) # Add 'year' column
        if combined_df is None:
            combined_df = df
        else:
            combined_df = combined_df.union(df)

    # Add a unique ID column
    combined_df = combined_df.withColumn("id", monotonically_increasing_id())
    
    return combined_df

In [12]:
# HDFS path to 'fifadata' folder
hdfs_data_path = "hdfs:///fifadata"  # HDFS path
data = load_data(hdfs_data_path)

                                                                                

['hdfs://cluster-14ef-m/fifadata/players_15.csv', 'hdfs://cluster-14ef-m/fifadata/players_17.csv', 'hdfs://cluster-14ef-m/fifadata/players_19.csv', 'hdfs://cluster-14ef-m/fifadata/players_20.csv', 'hdfs://cluster-14ef-m/fifadata/players_16.csv', 'hdfs://cluster-14ef-m/fifadata/players_18.csv', 'hdfs://cluster-14ef-m/fifadata/players_21.csv', 'hdfs://cluster-14ef-m/fifadata/players_22.csv']


100%|██████████| 8/8 [00:19<00:00,  2.49s/it]                                   


In [13]:
# get the pipeline 
pipeline = get_preprocess_pipeline()
pipeline_model = pipeline.fit(data)
data = pipeline_model.transform(data)

train, test = data.randomSplit([0.8, 0.2], seed=SEED)
train, val = train.randomSplit([0.75, 0.25], seed=SEED)

train.cache()
val.cache()
test.cache()

                                                                                

['value_eur', 'wage_eur', 'club_team_id', 'league_level', 'pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic']


                                                                                

['LB', 'CDM', 'LWB', 'RB', 'CB', 'RW', 'RM', 'RWB', 'GK', 'CAM', 'CF', 'LM', 'LW', 'CM', 'ST']


100%|██████████| 15/15 [00:00<00:00, 32.93it/s]
23/11/16 18:52:13 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

['LB', 'CDM', 'LWB', 'RB', 'CB', 'RW', 'RM', 'RWB', 'GK', 'CAM', 'CF', 'LM', 'LW', 'CM', 'ST']


100%|██████████| 15/15 [00:00<00:00, 16.31it/s]
                                                                                

DataFrame[outcome: int, club_joined: timestamp, mentality_composure: string, features: vector]

### Training pipeline

In [14]:
from pyspark.ml.regression import LinearRegression, DecisionTreeRegressor
from pyspark.ml.evaluation import RegressionEvaluator, BinaryClassificationEvaluator
import matplotlib.pyplot as plt
class PySparkMLModel:
    def __init__(self, model_type="logistic", learning_rate=0.01, is_wandb=False, is_plot=False):
        self.model_type = model_type
        self.learning_rate = learning_rate
        self.model = None
        self.evaluator = RegressionEvaluator(labelCol="outcome", predictionCol="prediction", metricName="rmse")
        self.wandb = is_wandb
        self.is_plot = is_plot

        # Initialize Weights & Biases
        if self.wandb: 
            wandb.init(project="pyspark_ml_model", entity="your_username")
            wandb.config.update({"learning_rate": self.learning_rate})

    def train(self, train_data):
        if self.model_type == "linear":
            self.model = LinearRegression(featuresCol="features", labelCol="outcome", regParam=self.learning_rate)
        elif self.model_type == "decision_tree":
            self.model = DecisionTreeRegressor(featuresCol="features", labelCol="outcome")
        else:
            raise ValueError("Unsupported model type")

        model = self.model.fit(train_data)
        return model

    def evaluate(self, model, data, data_type="test"):
        predictions = model.transform(data)
        metric = self.evaluator.evaluate(predictions)
        
        print(f"RMSE on {data_type} data = {metric}")

        # Log metrics to wandb
        if self.wandb: 
            wandb.log({f"{self.model_type}_{data_type}_rmse": metric})

        # Visualizations and additional logging can be added here as needed
        predictions = model.transform(data)
        if self.is_plot:
            self.plot_residuals(predictions)
            self.log_feature_importance(model)
        
    def plot_residuals(self, predictions):
        import seaborn as sns
        import matplotlib.pyplot as plt
        
        # Convert to Pandas DataFrame for easier plotting
        predictions_df = predictions.select("prediction", "outcome").toPandas()
        
        # Calculate residuals
        predictions_df['residuals'] = predictions_df['outcome'] - predictions_df['prediction']

        # Plot residuals
        plt.figure(figsize=(10, 6))
        sns.residplot(x='prediction', y='residuals', data=predictions_df, lowess=True, 
                      line_kws={'color': 'red', 'lw': 1})
        plt.title('Residuals vs Predicted')
        plt.xlabel('Predicted Values')
        plt.ylabel('Residuals')
        plt.axhline(y=0, color='black', linestyle='--')
        plt.show()
        if self.wandb:
            wandb.log({"residuals_plot": wandb.Image(plt)})
    
    def log_feature_importance(self, model):
        import seaborn as sns
        import matplotlib.pyplot as plt
        
        if self.model_type == "decision_tree":
            # Get feature importances
            feature_importances = model.featureImportances.toArray()

            # Plot feature importances
            plt.figure(figsize=(10, 6))
            sns.barplot(x=list(range(len(feature_importances))), y=feature_importances)
            plt.title('Feature Importances')
            plt.xlabel('Features')
            plt.ylabel('Importance')
            plt.show()
            if self.wandb:
                # Log feature importances
                wandb.log({"feature_importances": feature_importances})
                wandb.log({"feature_importances_plot": wandb.Image(plt)})


    def save_model(self, model, model_path):
        model.write().overwrite().save(model_path)
        if self.wandb: 
            wandb.save(model_path)

    def close(self):
        wandb.finish()

    def run_pipeline(self, train, val, test):
        model = self.train(train)
        self.evaluate(model, val, "validation")
        self.evaluate(model, test, "test")
        self.save_model(model, f"{self.model_type}_model")
        if self.wandb: 
            self.close()


In [15]:
# Training
if IS_SPARKML:
    logistic_model = PySparkMLModel(model_type="linear", learning_rate=0.01, is_plot=False)
    logistic_model.run_pipeline(train, val, test)

    dt_model = PySparkMLModel(model_type="decision_tree", is_plot=False)
    dt_model.run_pipeline(train, val, test)

                                                                                

RMSE on validation data = 1.649044212810787


                                                                                

RMSE on test data = 1.6798007229413023


                                                                                

RMSE on validation data = 1.8573776345569482
RMSE on test data = 1.869825649920477


                                                                                

### Pytorch Pipeline

In [16]:
# Downsample all data for training 
downsample_train = train.sample(False, 0.1)
downsample_val = val.sample(False, 0.1)

In [17]:
downsample_train.count()

                                                                                

8521

In [18]:
train_loader_MLP, val_loader_MLP = data_loader(train=train, val=val, batch_size=256)
input_dim = train.select('features').first()[0].size
model_list = [
    MLPRegressor(input_size=input_dim, hidden_sizes=[512, 256, 64], output_size=1),
    #TransformerRegressor(input_size=input_dim, d_model=512, nhead=8, num_layers=6, output_size=1)
    MLPResidualRegressor(input_size=input_dim, hidden_sizes=[1024,512,256,64], output_size=1)
    ]

for model in model_list:
    if model.__class__.__name__ == 'MLPRegressor':
        print(f"Training {model.__class__.__name__} model, {model.__class__.__name__ == 'MLPRegressor'}")
        trained_model = train_model(model, train_loader_MLP, val_loader_MLP, epochs=10, learning_rate=0.0003)
        torch.save(trained_model.state_dict(), f"{model.__class__.__name__}.pt")
        trained_model.eval()
        test_loss = validate_model(model, val_loader_MLP, nn.MSELoss())
        print(f"Test loss for {model.__class__.__name__} model = {test_loss}")
    else: 
        print(f"Training {model.__class__.__name__} model, {model.__class__.__name__ == 'MLPResidualRegressor'}")
        trained_model = train_model(model, train_loader_MLP, val_loader_MLP, epochs=10, learning_rate=0.001)
        torch.save(trained_model.state_dict(), f"{model.__class__.__name__}.pt")
        trained_model.eval()
        test_loss = validate_model(model, val_loader_MLP, nn.MSELoss())
        print(f"Test loss for {model.__class__.__name__} model = {test_loss}")
    del trained_model 
    #torch.cuda.empty_cache()
    gc.collect()

                                                                                

Training MLPRegressor model, True


100%|██████████| 333/333 [00:03<00:00, 91.36it/s] 


Epoch 1/10, Train Loss: 324.0882, Validation Loss: 5.3124


100%|██████████| 333/333 [00:02<00:00, 146.60it/s]


Epoch 2/10, Train Loss: 4.5760, Validation Loss: 3.7261


100%|██████████| 333/333 [00:02<00:00, 133.81it/s]


Epoch 3/10, Train Loss: 3.1283, Validation Loss: 2.4714


100%|██████████| 333/333 [00:02<00:00, 136.95it/s]


Epoch 4/10, Train Loss: 2.1354, Validation Loss: 1.8481


100%|██████████| 333/333 [00:02<00:00, 130.96it/s]


Epoch 5/10, Train Loss: 1.6743, Validation Loss: 1.5745


100%|██████████| 333/333 [00:02<00:00, 137.34it/s]


Epoch 6/10, Train Loss: 1.4481, Validation Loss: 1.3337


100%|██████████| 333/333 [00:02<00:00, 130.88it/s]


Epoch 7/10, Train Loss: 1.2808, Validation Loss: 1.2157


100%|██████████| 333/333 [00:02<00:00, 133.45it/s]


Epoch 8/10, Train Loss: 1.1568, Validation Loss: 1.1413


100%|██████████| 333/333 [00:02<00:00, 136.23it/s]


Epoch 9/10, Train Loss: 1.0847, Validation Loss: 1.0211


100%|██████████| 333/333 [00:02<00:00, 126.68it/s]


Epoch 10/10, Train Loss: 0.9756, Validation Loss: 0.9347
Test loss for MLPRegressor model = 0.9346799701452255
Training MLPResidualRegressor model, True


100%|██████████| 333/333 [00:08<00:00, 37.22it/s]


Epoch 1/10, Train Loss: 63.2848, Validation Loss: 2.8400


100%|██████████| 333/333 [00:08<00:00, 37.09it/s]


Epoch 2/10, Train Loss: 2.0971, Validation Loss: 1.7931


100%|██████████| 333/333 [00:12<00:00, 27.20it/s]


Epoch 3/10, Train Loss: 1.5097, Validation Loss: 1.2511


100%|██████████| 333/333 [00:12<00:00, 27.24it/s]


Epoch 4/10, Train Loss: 1.2923, Validation Loss: 1.4898


100%|██████████| 333/333 [00:12<00:00, 27.51it/s]


Epoch 5/10, Train Loss: 1.2487, Validation Loss: 0.9873


100%|██████████| 333/333 [00:12<00:00, 27.36it/s]


Epoch 6/10, Train Loss: 1.2619, Validation Loss: 2.2133


100%|██████████| 333/333 [00:12<00:00, 27.46it/s]


Epoch 7/10, Train Loss: 1.6741, Validation Loss: 4.1413


100%|██████████| 333/333 [00:12<00:00, 27.19it/s]


Epoch 8/10, Train Loss: 1.4657, Validation Loss: 1.7551


100%|██████████| 333/333 [00:12<00:00, 27.73it/s]


Epoch 9/10, Train Loss: 1.5425, Validation Loss: 1.8964


100%|██████████| 333/333 [00:12<00:00, 26.96it/s]


Epoch 10/10, Train Loss: 1.5406, Validation Loss: 1.0111
Test loss for MLPResidualRegressor model = 1.0111230186053686


In [19]:
spark.stop()