## Load Dataset 

In [None]:
import pandas as pd 

# Path of the training CSV, relative to the notebook's working directory.
FILENAME = "train.csv"
# Load the whole training set; later cells read the "Year" column from `df`.
# (A stray trailing `x` after this call used to make the cell a SyntaxError.)
df = pd.read_csv(FILENAME)

## Dataset Info 

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# --- Duplicates: report how many rows are exact copies, then drop them ------
duplicates = df.duplicated()
print("Number of duplicate rows: ", duplicates.sum())
df = df.drop_duplicates()

# --- Target / features: "Year" is the regression target, the rest is input --
y = df["Year"]
X = df.drop(columns="Year")

print("Questa è la variabile X  \n")
print(X.shape)
print("\n\nQuesta è la variabile y  \n")
print(y.shape)

# --- Missing values: count rows that contain at least one NaN ---------------
total_nan_rows = df.isna().any(axis="columns").sum()
print("Number of Nan Rows: ", total_nan_rows)

# --- Overall size of the de-duplicated frame --------------------------------
num_rows = df.shape[0]
num_cols = df.shape[1]
print("Number of rows: ", num_rows)
print("Number of columns: ", num_cols)

# --- Distribution of the target ---------------------------------------------
fig, ax = plt.subplots()
ax.hist(df['Year'])
ax.set_xlabel('Anno di pubblicazione')
ax.set_ylabel('Frequenza')
ax.set_title('Distribuzione anno di pubblicazione')
plt.show()

# --- Pairwise correlation of all columns, drawn as an unlabeled heatmap -----
matrix_corr = df.corr()
fig, ax = plt.subplots(figsize=(8,6))
sns.heatmap(
    matrix_corr,
    annot=False,
    cmap='coolwarm',
    fmt=".2f",
    linewidths=.5,
    xticklabels=False,
    yticklabels=False,
    cbar=False,
    ax=ax,
)
ax.set_title('Matrice di Correlazione')
plt.show()

# Bare expression so the notebook renders the correlation table itself.
matrix_corr


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# One box per feature column of X (wide-form input).
plt.figure(figsize=(10, 5))
# Pass the frame by keyword: the meaning of the first positional argument of
# sns.boxplot differs between seaborn releases (`x` vs `data`), so `data=X`
# is the unambiguous way to request a wide-form plot.
sns.boxplot(data=X)
plt.title('Box Plot delle Features')
plt.show()


## PreProcess


In [None]:
from sklearn import preprocessing
import pickle
# a function with different normalization and scaling techniques
def preprocessTrain(X_train, X_test, modality):
    """Scale a train/test pair with a scaler fitted on the training data only.

    Parameters
    ----------
    X_train : array-like
        Data used to fit the scaler; it is also transformed and returned.
    X_test : array-like
        Data transformed with the scaler that was fitted on ``X_train``.
    modality : str
        ``'standard'`` uses ``StandardScaler`` and pickles it to
        ``standardScaler.save``; ``'min-max'`` uses ``MinMaxScaler`` and
        pickles it to ``minMaxScaler.save``. Any other value returns the
        inputs untouched (the notebook uses ``'No'`` for that purpose).

    Returns
    -------
    tuple
        ``(X_train_p, X_test_p)`` -- the transformed pair, or the original
        objects when no scaling is applied.

    Warns
    -----
    UserWarning
        If ``modality`` is neither a known scaler name nor ``'No'``; the data
        is still passed through unscaled so existing callers keep working.
    """
    import warnings

    if modality == 'standard':
        scaler, path = preprocessing.StandardScaler(), "standardScaler.save"
    elif modality == 'min-max':
        scaler, path = preprocessing.MinMaxScaler(), "minMaxScaler.save"
    else:
        if modality != 'No':
            # A misspelt mode (e.g. 'min_max') would otherwise silently skip scaling.
            warnings.warn(
                "preprocessTrain: unknown modality {!r}; data left unscaled".format(modality),
                stacklevel=2,
            )
        return X_train, X_test

    # Fit first so a failing fit does not leave an empty scaler file behind.
    scaler.fit(X_train)

    # The context manager guarantees the pickle file is flushed and closed.
    with open(path, mode="wb") as file:
        pickle.dump(obj=scaler, file=file)

    return scaler.transform(X_train), scaler.transform(X_test)



## Linear Regression

In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import itertools
from scipy import stats

seed = 42

# Split into train / validation / test sets: 20% of the rows are held out for
# testing, then 25% of the remainder for validation (60/20/20 overall).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=seed)

# Candidate preprocessing modes understood by preprocessTrain.
pre = ['No', 'standard', 'min-max']
hyperparameters = itertools.product(pre)

def score(model, X, y):
    """Return the R-squared of ``model``'s predictions on ``X`` against ``y``."""
    y_pred = model.predict(X)
    return r2_score(y, y_pred)

best_r2 = -np.inf
best_preprocessing = 'No'
best_model = None

# Model selection: fit one model per preprocessing mode, compare on validation.
for preprocessing_type in pre:
    # Preprocess (scaler is fitted on the training split only)
    X_train_p, X_val_p = preprocessTrain(X_train, X_val, preprocessing_type)

    # Initialize Linear Regression model
    model = LinearRegression()

    # Train the model on the training set
    model.fit(X_train_p, y_train)

    # Evaluate the model on the validation set
    val_r2 = score(model, X_val_p, y_val)

    print("Validation R-squared: {:.3f}".format(val_r2), "Preprocessing =", preprocessing_type)

    # Save best result so far (strict '>' keeps the first mode on ties)
    if val_r2 > best_r2:
        best_r2 = val_r2
        best_preprocessing = preprocessing_type
        best_model = model

print("\nBest Preprocessing:", best_preprocessing)


# Final training and testing with the preprocessing selected above. This used
# to be hard-coded to "standard", which ignored the validation result; the
# Random Forest cell already follows the "use the best setting" pattern.
X_train_p, X_test_p = preprocessTrain(X_train, X_test, best_preprocessing)
model = LinearRegression()
model.fit(X_train_p, y_train)

# Persist the fitted model; the context manager closes the file even if
# pickling raises.
with open("regression.save", "wb") as file:
    pickle.dump(model, file)

# Evaluate on the held-out test set.
y_test_pred = model.predict(X_test_p)
mse_test = mean_squared_error(y_test, y_test_pred)
r2_test = r2_score(y_test, y_test_pred)
mae_test = np.mean(np.abs(y_test - y_test_pred))
mape_test = np.mean(np.abs((y_test - y_test_pred) / y_test)) * 100

# Print results
print("\nTest set MSE: {:.2f}".format(mse_test))
print("Test set R-squared: {:.2f}".format(r2_test))
print("Test set MAE: {:.2f}".format(mae_test))
print("Test set MAPE: {:.2f}%".format(mape_test))

## Random Forest 

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import itertools
import pickle

seed = 42

# X and y are expected to exist already (built in the "Dataset Info" cell).
# Hold out 20% of the rows for testing, then 25% of the rest for validation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=seed
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=seed
)

# Search grid: every preprocessing mode combined with every forest size.
pre = ['No', 'standard', 'min-max']
n_estimators_values = [50, 100, 150]  # adjust the number of estimators if needed
hyperparameters = list(itertools.product(pre, n_estimators_values))


def score(model, X, y):
    """Compute R-squared of ``model.predict(X)`` with respect to ``y``."""
    return r2_score(y, model.predict(X))


# Running best over the grid; -inf guarantees the first candidate is accepted.
best_r2 = -np.inf
best_preprocessing = 'No'
best_n_estimators = 0
best_model = None

# Same visiting order as iterating `hyperparameters`: preprocessing outermost.
for preprocessing_type in pre:
    for n_estimators in n_estimators_values:
        # Scale train/validation with a scaler fitted on the training split.
        X_train_p, X_val_p = preprocessTrain(X_train, X_val, preprocessing_type)

        # Fit a forest of the requested size on the training split.
        model = RandomForestRegressor(n_estimators=n_estimators, random_state=seed)
        model.fit(X_train_p, y_train)

        # Validation R-squared drives the selection.
        val_r2 = score(model, X_val_p, y_val)

        print("Validation R-squared: {:.3f}".format(val_r2), "Preprocessing =", preprocessing_type, "n_estimators =", n_estimators)

        # Strict comparison: on ties the earlier configuration is kept.
        if val_r2 > best_r2:
            best_r2 = val_r2
            best_preprocessing = preprocessing_type
            best_n_estimators = n_estimators
            best_model = model

print("\nBest Preprocessing:", best_preprocessing)
print("Best n_estimators:", best_n_estimators)

# Refit from scratch with the winning configuration and score it on the test set.
X_train_p_best, X_test_p_best = preprocessTrain(X_train, X_test, best_preprocessing)
final_model = RandomForestRegressor(n_estimators=best_n_estimators, random_state=seed)
final_model.fit(X_train_p_best, y_train)

# Persist the refitted model.
with open("randomForestReg.save", "wb") as file:
    pickle.dump(final_model, file)

# Test-set predictions and metrics.
y_test_pred = final_model.predict(X_test_p_best)
abs_err = np.abs(y_test - y_test_pred)

mse_test = mean_squared_error(y_test, y_test_pred)
r2_test = r2_score(y_test, y_test_pred)
mae_test = np.mean(abs_err)
mape_test = np.mean(abs_err / np.abs(y_test)) * 100

# Report.
print("\nTest set MSE: {:.2f}".format(mse_test))
print("Test set R-squared: {:.2f}".format(r2_test))
print("Test set MAE: {:.2f}".format(mae_test))
print("Test set MAPE: {:.2f}%".format(mape_test))


## SVR


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import itertools
import pickle

seed = 42

# X and y are expected to exist already (built in the "Dataset Info" cell).
# Hold out 20% of the rows for testing, then 25% of the rest for validation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=seed)

# Search grid: every kernel combined with every regularisation strength C.
Cs = [0.01, 0.1, 1, 10, 100]
kernels = ['linear', 'poly', 'rbf']
hyperparameters = list(itertools.product(kernels, Cs))

def score(model, X, y):
    """Return the R-squared of ``model``'s predictions on ``X`` against ``y``."""
    y_pred = model.predict(X)
    return r2_score(y, y_pred)

best_r2 = -np.inf
best_C = 0
best_kernel = 'linear'
best_model = None

# Preprocess once: the scaling does not depend on kernel or C, so it is hoisted
# out of the search loop. The mode must be spelt 'min-max' -- the previous
# 'min_max' matched no branch of preprocessTrain, so the data was silently
# passed through unscaled.
X_train_p, X_val_p = preprocessTrain(X_train, X_val, "min-max")

for kernel, C in hyperparameters:
    # Initialize SVR model
    model = SVR(kernel=kernel, C=C)

    # Train the model on the training set
    model.fit(X_train_p, y_train)

    # Evaluate the model on the validation set
    val_r2 = score(model, X_val_p, y_val)

    print("Validation R-squared: {:.3f}".format(val_r2), "Kernel =", kernel, "C =", C)

    # Save best result so far (strict '>' keeps the first candidate on ties)
    if val_r2 > best_r2:
        best_r2 = val_r2
        best_C = C
        best_kernel = kernel
        best_model = model

print("\nBest C value:", best_C)
print("Best Kernel:", best_kernel)

# Final training and testing with best hyperparameters, using the same
# min-max scaling as during the search.
X_train_p_best, X_test_p_best = preprocessTrain(X_train, X_test, "min-max")
final_model = SVR(kernel=best_kernel, C=best_C)
final_model.fit(X_train_p_best, y_train)

# Save the trained model to a file
with open("svr_model.save", "wb") as file:
    pickle.dump(final_model, file)

# Predict on the test set
y_test_pred = final_model.predict(X_test_p_best)

# Evaluate the model on the test set
mse_test = mean_squared_error(y_test, y_test_pred)
r2_test = r2_score(y_test, y_test_pred)
mae_test = np.mean(np.abs(y_test - y_test_pred))
mape_test = np.mean(np.abs((y_test - y_test_pred) / y_test)) * 100

# Print results
print("\nTest set MSE: {:.2f}".format(mse_test))
print("Test set R-squared: {:.2f}".format(r2_test))
print("Test set MAE: {:.2f}".format(mae_test))
print("Test set MAPE: {:.2f}%".format(mape_test))


## KNN Regressor 

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn import preprocessing as sk_preprocessing
import numpy as np
import itertools
from scipy import stats
from sklearn.model_selection import GridSearchCV

seed = 42

# Hold out 20% of the rows for testing, then 25% of the rest for validation
# (60/20/20 overall). X and y come from the "Dataset Info" cell.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=seed)

# Search grid: every preprocessing mode combined with every neighbourhood size.
pre = ['No', 'standard', 'min-max']
n_neighbors_values = [5, 7, 9, 11]  # adjust these values to your needs
hyperparameters = list(itertools.product(pre, n_neighbors_values))

def score(model, X, y):
    """Return the R-squared of ``model``'s predictions on ``X`` against ``y``."""
    y_pred = model.predict(X)
    return r2_score(y, y_pred)

best_r2 = -np.inf
best_preprocessing = 'No'
best_n_neighbors = 0
best_model = None

# NOTE(review): this splitter is created but not consumed by the hold-out
# search below -- confirm whether cross-validation was intended.
cv = KFold(n_splits=5, shuffle=True, random_state=seed)

for preprocessing_type, n_neighbors in hyperparameters:
    # Preprocess (scaler is fitted on the training split only)
    X_train_p, X_val_p = preprocessTrain(X_train, X_val, preprocessing_type)

    # Initialize KNN Regressor model
    model = KNeighborsRegressor(n_neighbors=n_neighbors)

    # Train the model on the training set
    model.fit(X_train_p, y_train)

    # Evaluate the model on the validation set
    val_r2 = score(model, X_val_p, y_val)

    print("Validation R-squared: {:.3f}".format(val_r2), "Preprocessing =", preprocessing_type, "n_neighbors =", n_neighbors)

    # Save best result so far (strict '>' keeps the first candidate on ties)
    if val_r2 > best_r2:
        best_r2 = val_r2
        best_preprocessing = preprocessing_type
        best_n_neighbors = n_neighbors
        best_model = model

print("\nBest Preprocessing:", best_preprocessing)
print("Best n_neighbors:", best_n_neighbors)


# Final training and testing with the configuration selected above. This used
# to be hard-coded to "min-max" / n_neighbors=11, which ignored the search
# result whenever a different combination won on the validation set.
X_train_p, X_test_p = preprocessTrain(X_train, X_test, best_preprocessing)
model = KNeighborsRegressor(n_neighbors=best_n_neighbors)
model.fit(X_train_p, y_train)

# Persist the fitted model; the context manager closes the file even if
# pickling raises.
with open("kNeighborReg.save", "wb") as file:
    pickle.dump(model, file)

# Evaluate on the held-out test set.
y_test_pred = model.predict(X_test_p)
mse_test = mean_squared_error(y_test, y_test_pred)
r2_test = r2_score(y_test, y_test_pred)
mae_test = np.mean(np.abs(y_test - y_test_pred))
mape_test = np.mean(np.abs((y_test - y_test_pred) / y_test)) * 100

# Print results
print("\nTest set MSE: {:.2f}".format(mse_test))
print("Test set R-squared: {:.2f}".format(r2_test))
print("Test set MAE: {:.2f}".format(mae_test))
print("Test set MAPE: {:.2f}%".format(mape_test))
