# PREPROCESSING

In [8]:
from sklearn import preprocessing

# a function with different normalization and scaling techniques
def preprocess(X_train, X_test, modality):
    """Normalize or scale a train/test pair of feature matrices.

    The transformation is selected by ``modality``:

    * ``'l1'`` / ``'l2'`` -- each matrix is passed through scikit-learn's
      ``normalize`` with that norm (no fitting involved).
    * ``'standard'`` -- a ``StandardScaler`` is fitted on ``X_train`` only and
      then applied to both matrices.
    * ``'min-max'`` -- a ``MinMaxScaler`` is fitted on ``X_train`` only and
      then applied to both matrices.
    * any other value (the notebook uses ``'No'``) -- both inputs are returned
      unchanged.

    Parameters
    ----------
    X_train : array-like
        Data used to fit the scaler (when the modality needs fitting).
    X_test : array-like
        Data transformed with the statistics learned from ``X_train``.
    modality : str
        Name of the preprocessing technique, see above.

    Returns
    -------
    tuple
        ``(X_train_p, X_test_p)``, the preprocessed counterparts of the inputs.
    """
    # Pass-through: unknown modalities (including 'No') leave the data as is.
    if modality not in ('l1', 'l2', 'standard', 'min-max'):
        return X_train, X_test

    # Imported locally under a private alias so the function keeps working even
    # if a notebook cell rebinds the global name `preprocessing` (e.g. by using
    # it as a loop variable).
    from sklearn import preprocessing as sk_preprocessing

    if modality in ('l1', 'l2'):
        return (sk_preprocessing.normalize(X_train, norm=modality),
                sk_preprocessing.normalize(X_test, norm=modality))

    if modality == 'standard':
        scaler = sk_preprocessing.StandardScaler()
    else:  # 'min-max'
        scaler = sk_preprocessing.MinMaxScaler()

    # Fit on the training data only, then apply the same transform to both sets.
    scaler.fit(X_train)
    return scaler.transform(X_train), scaler.transform(X_test)


Cross-validation R-squared: 0.231 Preprocessing = No
Cross-validation R-squared: 0.231 Preprocessing = standard
Cross-validation R-squared: 0.231 Preprocessing = min-max

Best Preprocessing: No

Test set MSE: 84.08
Test set R-squared: 0.23


**Test con Linear Regression**

In [None]:

import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import itertools

# Load the data
FILENAME = "train.csv"
df = pd.read_csv(FILENAME)

seed = 42

# Select the input features (X) and the target (y)
X = df.drop("Year", axis=1)
y = df["Year"]

# 60/20/20 split. NOTE: X_val / y_val are not used below (model selection is
# done with k-fold cross-validation on X_train); the second split is kept so
# the training and test partitions stay the same as before.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=seed)

# Hyperparameters
pre = ['No', 'standard', 'min-max']

def score(model, X, y):
    """Return the R-squared of the model's predictions on X against y."""
    y_pred = model.predict(X)
    return r2_score(y, y_pred)

best_r2 = -np.inf
best_preprocessing = 'No'

cv = KFold(n_splits=5, shuffle=True, random_state=seed)

for preprocessing_type in pre:  # not named `preprocessing`: that is the sklearn module used by preprocess()
    fold_scores = []
    for fit_idx, eval_idx in cv.split(X_train):
        # Preprocess inside the fold: the scaler is fitted on the fitting part
        # only, so the held-out part never leaks into the scaling statistics.
        X_fit_p, X_eval_p = preprocess(
            X_train.iloc[fit_idx], X_train.iloc[eval_idx], preprocessing_type
        )

        # Train a fresh Linear Regression model on this fold
        model = LinearRegression()
        model.fit(X_fit_p, y_train.iloc[fit_idx])
        fold_scores.append(score(model, X_eval_p, y_train.iloc[eval_idx]))

    # Save best result so far
    avg_r2 = np.mean(fold_scores)
    print("Cross-validation R-squared: {:.3f}".format(avg_r2), "Preprocessing =", preprocessing_type)

    if avg_r2 > best_r2:
        best_r2 = avg_r2
        best_preprocessing = preprocessing_type

print("\nBest Preprocessing:", best_preprocessing)

# Final training and testing
X_train_p, X_test_p = preprocess(X_train, X_test, best_preprocessing)
model = LinearRegression()
model.fit(X_train_p, y_train)

# Evaluation on the test set
y_test_pred = model.predict(X_test_p)
mse_test = mean_squared_error(y_test, y_test_pred)
r2_test = r2_score(y_test, y_test_pred)

# Print results
print("\nTest set MSE: {:.2f}".format(mse_test))
print("Test set R-squared: {:.2f}".format(r2_test))

**Test con Random Forest Regression**

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn import preprocessing as sk_preprocessing
import numpy as np
import itertools

# Load the data
FILENAME = "train.csv"
df = pd.read_csv(FILENAME)

seed = 42

# Select the input features (X) and the target (y)
X = df.drop("Year", axis=1)
y = df["Year"]

# 60/20/20 split. NOTE: X_val / y_val are not used below (model selection is
# done with k-fold cross-validation on X_train); the second split is kept so
# the training and test partitions stay the same as before.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=seed)

# Hyperparameters
pre = ['No', 'standard', 'min-max']

def score(model, X, y):
    """Return the R-squared of the model's predictions on X against y."""
    y_pred = model.predict(X)
    return r2_score(y, y_pred)

best_r2 = -np.inf
best_preprocessing = 'No'

cv = KFold(n_splits=5, shuffle=True, random_state=seed)

for preprocessing_type in pre:  # not named `preprocessing`: that is the sklearn module used by preprocess()
    fold_scores = []
    for fit_idx, eval_idx in cv.split(X_train):
        # Preprocess inside the fold: the scaler is fitted on the fitting part
        # only, so the held-out part never leaks into the scaling statistics.
        X_fit_p, X_eval_p = preprocess(
            X_train.iloc[fit_idx], X_train.iloc[eval_idx], preprocessing_type
        )

        # Train a fresh Random Forest Regressor on this fold
        model = RandomForestRegressor(n_estimators=100, random_state=seed)  # Adjust the number of estimators if needed
        model.fit(X_fit_p, y_train.iloc[fit_idx])
        fold_scores.append(score(model, X_eval_p, y_train.iloc[eval_idx]))

    # Save best result so far
    avg_r2 = np.mean(fold_scores)
    print("Cross-validation R-squared: {:.3f}".format(avg_r2), "Preprocessing =", preprocessing_type)

    if avg_r2 > best_r2:
        best_r2 = avg_r2
        best_preprocessing = preprocessing_type

print("\nBest Preprocessing:", best_preprocessing)

# Final training and testing
X_train_p, X_test_p = preprocess(X_train, X_test, best_preprocessing)
model = RandomForestRegressor(n_estimators=100, random_state=seed)  # Adjust the number of estimators if needed
model.fit(X_train_p, y_train)

# Evaluation on the test set
y_test_pred = model.predict(X_test_p)
mse_test = mean_squared_error(y_test, y_test_pred)
r2_test = r2_score(y_test, y_test_pred)

# Print results
print("\nTest set MSE: {:.2f}".format(mse_test))
print("Test set R-squared: {:.2f}".format(r2_test))


**Test con Support Vector Regression**

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import itertools

# Load the data
FILENAME = "train.csv"
df = pd.read_csv(FILENAME)

seed = 42

# Select the input features (X) and the target (y)
X = df.drop("Year", axis=1)
y = df["Year"]

# 60/20/20 split. NOTE: X_val / y_val are not used below (model selection is
# done with k-fold cross-validation on X_train); the second split is kept so
# the training and test partitions stay the same as before.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=seed)

# Hyperparameters
pre = ['No', 'standard', 'min-max']
Cs = [0.01, 0.1, 1, 10, 100]
kernels = ['linear', 'poly', 'rbf']
hyperparameters = itertools.product(kernels, pre, Cs)

def score(model, X, y):
    """Return the R-squared of the model's predictions on X against y."""
    y_pred = model.predict(X)
    return r2_score(y, y_pred)

best_r2 = -np.inf
best_C = 0
best_preprocessing = 'No'
best_kernel = 'linear'

cv = KFold(n_splits=5, shuffle=True, random_state=seed)

# The loop variable must NOT be called `preprocessing`: that would rebind the
# sklearn module that preprocess() looks up globally and make the scaler
# branches fail with an AttributeError on a str.
for kernel, preprocessing_type, C in hyperparameters:
    fold_scores = []
    for fit_idx, eval_idx in cv.split(X_train):
        # Preprocess inside the fold: the scaler is fitted on the fitting part
        # only, so the held-out part never leaks into the scaling statistics.
        X_fit_p, X_eval_p = preprocess(
            X_train.iloc[fit_idx], X_train.iloc[eval_idx], preprocessing_type
        )

        # Train a fresh SVR model on this fold
        model = SVR(kernel=kernel, C=C)
        model.fit(X_fit_p, y_train.iloc[fit_idx])
        fold_scores.append(score(model, X_eval_p, y_train.iloc[eval_idx]))

    # Save best result so far
    avg_r2 = np.mean(fold_scores)
    print("Cross-validation R-squared: {:.3f}".format(avg_r2), "Kernel =", kernel, "Preprocessing =", preprocessing_type, "C =", C)

    if avg_r2 > best_r2:
        best_r2 = avg_r2
        best_C = C
        best_preprocessing = preprocessing_type
        best_kernel = kernel

print("\nBest C value:", best_C)
print("Best Preprocessing:", best_preprocessing)
print("Best Kernel:", best_kernel)

# Final training and testing
X_train_p, X_test_p = preprocess(X_train, X_test, best_preprocessing)
model = SVR(kernel=best_kernel, C=best_C)
model.fit(X_train_p, y_train)

# Evaluation on the test set
y_test_pred = model.predict(X_test_p)
mse_test = mean_squared_error(y_test, y_test_pred)
r2_test = r2_score(y_test, y_test_pred)

# Print results
print("\nTest set MSE: {:.2f}".format(mse_test))
print("Test set R-squared: {:.2f}".format(r2_test))


**Test con KNN Regressor**

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn import preprocessing as sk_preprocessing
import numpy as np
import itertools

# Load the data
FILENAME = "train.csv"
df = pd.read_csv(FILENAME)

seed = 42

# Select the input features (X) and the target (y)
X = df.drop("Year", axis=1)
y = df["Year"]

# 60/20/20 split. NOTE: X_val / y_val are not used below (model selection is
# done with k-fold cross-validation on X_train); the second split is kept so
# the training and test partitions stay the same as before.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=seed)

# Hyperparameters
pre = ['No', 'standard', 'min-max']

def score(model, X, y):
    """Return the R-squared of the model's predictions on X against y."""
    y_pred = model.predict(X)
    return r2_score(y, y_pred)

best_r2 = -np.inf
best_preprocessing = 'No'

cv = KFold(n_splits=5, shuffle=True, random_state=seed)

for preprocessing_type in pre:  # not named `preprocessing`: that is the sklearn module used by preprocess()
    fold_scores = []
    for fit_idx, eval_idx in cv.split(X_train):
        # Preprocess inside the fold: the scaler is fitted on the fitting part
        # only, so the held-out part never leaks into the scaling statistics
        # (this matters for a distance-based model such as KNN).
        X_fit_p, X_eval_p = preprocess(
            X_train.iloc[fit_idx], X_train.iloc[eval_idx], preprocessing_type
        )

        # Train a fresh KNN Regressor on this fold
        model = KNeighborsRegressor(n_neighbors=5)  # Adjust the number of neighbors if needed
        model.fit(X_fit_p, y_train.iloc[fit_idx])
        fold_scores.append(score(model, X_eval_p, y_train.iloc[eval_idx]))

    # Save best result so far
    avg_r2 = np.mean(fold_scores)
    print("Cross-validation R-squared: {:.3f}".format(avg_r2), "Preprocessing =", preprocessing_type)

    if avg_r2 > best_r2:
        best_r2 = avg_r2
        best_preprocessing = preprocessing_type

print("\nBest Preprocessing:", best_preprocessing)

# Final training and testing
X_train_p, X_test_p = preprocess(X_train, X_test, best_preprocessing)
model = KNeighborsRegressor(n_neighbors=5)  # Adjust the number of neighbors if needed
model.fit(X_train_p, y_train)

# Evaluation on the test set
y_test_pred = model.predict(X_test_p)
mse_test = mean_squared_error(y_test, y_test_pred)
r2_test = r2_score(y_test, y_test_pred)

# Print results
print("\nTest set MSE: {:.2f}".format(mse_test))
print("Test set R-squared: {:.2f}".format(r2_test))
