In [1]:
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn import CrossEntropyLoss
from sklearn.model_selection import KFold

## Pre-Process Train Data 

In [3]:
train_dataset = pd.read_csv("../Dataset_DAY1/Data/train_set.csv", delimiter=';')

In [4]:
train_dataset.head()

Unnamed: 0,application_ID,decision_date,company_ID,external_score_ver01,external_score_ver02,late_payment_score,external_score_late_payment_integrated,external_score_moderate,external_score_adverse,external_score_ver03,...,avg_count_enti_affidanti,std_count_enti_affidanti,max_count_enti_affidanti,last_count_enti_affidanti,avg_count_numero_prima_info,std_count_numero_prima_info,max_count_numero_prima_info,last_count_numero_prima_info,days_to_default,target
0,a1Q7R00000ej2yjUAA,2021-11-30,7256587870,10,3,,,,,MISSING,...,1,0,1,1,0,0,0,0,522,1
1,a1Q2X00000ZWC5LUAX,2020-10-06,6178307100,7,3,,,,,H,...,1,0,1,1,2,0,2,2,1498,0
2,a1Q2X00000XcCCQUA3,2020-02-11,7692855390,7,3,,,,,MISSING,...,1,0,1,1,1,0,1,1,779,1
3,a1Q7R00000ejSs3UAE,2022-01-18,5752241730,8,2,,,,,MISSING,...,1,0,1,1,5,522232967867094,1,0,1498,0
4,a1Q7R00000eiRidUAE,2021-09-16,7533506540,4,1,,,,,MISSING,...,0,0,0,0,0,0,0,0,1498,0


In [5]:
## drop features
def Drop_unneed_columns(dataset):
    cols= ['application_ID', 'company_ID', 'decision_date']
    dataset= dataset.drop(columns=cols)
    return dataset

In [6]:
def Nan_values(dataset):
    column_names = dataset.columns.tolist()
    drop_columns = []
    for name in column_names:
        nan_count = dataset[name].isna().sum()
        print(f"column {name}: {nan_count}")
        if (nan_count/28000) > 0.5:
            print(f"Number of NaN values in column '{name}': {nan_count}")
            drop_columns.append(name)
    return drop_columns

In [7]:
def Replace_cate_to_value(column_name, dataset):
    # Extract categories

    # Extract unique category names from the column
    unique_categories = dataset[column_name].unique()

    # convert 'numpy.ndarray' in to a python list
    l = unique_categories.tolist()
    
    if 'MISSING' in l:
        l.remove('MISSING')
        l.sort(reverse=True)

    print(l)
    
    dic = { l[i]:i+1 for i in range(0, len(l))}

    # Replace values in the column based on the dictionary mapping
    dataset[column_name] = dataset[column_name].replace(dic)
    return dic, dataset

In [8]:
def Category_values(dataset):
    column_names = ['industry_sector','region', 'geo_area','external_score_ver03', 'province','juridical_form']
    dic = {}
    for column_name in column_names:
        category_dic, dataset = Replace_cate_to_value(column_name, dataset)
        dic[column_name] = category_dic
    return dic, dataset

In [9]:
def Replace_bool_toNumbers(dataset):
    dataset['cr_available'] = [int(dataset['cr_available'][i]) for i in range(len(dataset['cr_available']))]
    dataset['cr_available']
    return dataset

In [10]:
def mean_var03(dataset):
    s0, s1, c0, c1 = 0,0,0,0
    # unique_labels = dataset['target'].unique()
    for index, row in dataset.iterrows():
        if row['external_score_ver03'] != 'MISSING':
            if row['target'] == 0:
                s0 += row['external_score_ver03']
                c0 +=1
            elif row['target'] == 1:
                s1 +=  row['external_score_ver03']
                c1 += 1

    m0 = round(s0/c0)
    m1 = round(s1/c1)
    print(m0)
    print(m1)
    return m0,m1

In [11]:
def Replace_missing(dataset, m0, m1):
    # Assuming df is your DataFrame and 'column_to_change' is the column you want to change
    # 'condition_column' is the column based on which you want to change the content
    dataset.loc[(dataset['target'] == 1) & (dataset['external_score_ver03'] == 'MISSING'), 'external_score_ver03'] = m1
    dataset.loc[(dataset['target'] == 0) & (dataset['external_score_ver03'] == 'MISSING'), 'external_score_ver03'] = m0
    dataset['external_score_ver03']

    # For example, if you want to change the content of 'column_to_change' to 'new_value' where 'condition_column' is True
    # Replace 'new_value', 'column_to_change', and 'condition_column' with your actual values
    return dataset

In [12]:
# Drop columns 
train_dataset = Drop_unneed_columns(train_dataset)
drop_columns = Nan_values(train_dataset)
train_dataset = train_dataset.drop(columns=drop_columns)

column external_score_ver01: 0
column external_score_ver02: 0
column late_payment_score: 27488
Number of NaN values in column 'late_payment_score': 27488
column external_score_late_payment_integrated: 27488
Number of NaN values in column 'external_score_late_payment_integrated': 27488
column external_score_moderate: 27208
Number of NaN values in column 'external_score_moderate': 27208
column external_score_adverse: 27208
Number of NaN values in column 'external_score_adverse': 27208
column external_score_ver03: 0
column age: 0
column province: 2654
column juridical_form: 0
column industry_sector: 0
column gross_margin_ratio: 0
column core_income_ratio: 0
column cash_asset_ratio: 0
column consolidated_liabilities_ratio: 0
column tangible_assets_ratio: 0
column revenues: 0
column cr_available: 0
column region: 0
column geo_area: 0
column last_statement_age: 0
column overrun_freq_a_revoca_autoliquidanti: 0
column avg_tension_a_revoca_autoliquidanti: 0
column std_tension_a_revoca_autoliqui

In [13]:
# replace bool values to numerical ones 
category_dics, train_dataset = Category_values(train_dataset)
train_dataset = Replace_bool_toNumbers(train_dataset)

['Servizi', 'Costruzioni e materiali per costruzioni', 'Distribuzione', 'Trasporti', 'Utility', 'Editoria e stampa', 'Elettrotecnica ed elettronica', 'Altri beni di consumo', 'Sistema moda', 'Metallurgia e prodotti in metallo', 'Meccanica', 'Chimica di base e intermedi', 'Elettrodomestici', 'Agricoltura', 'Alimentare', 'Energia ed estrazione', 'Largo consumo / attività ricreativo-culturali', 'Mezzi di trasporto', 'Holding, finanziarie ed altro', 'Farmaceutica']
['Sicilia', 'Sardegna', 'Puglia', 'Lazio', 'Veneto', 'Lombardia', 'Campania', 'Piemonte', 'Abruzzo', 'Basilicata', 'Emilia-Romagna', 'Umbria', 'Toscana', "Valle d'Aosta/Vallée d'Aoste", 'Calabria', 'Marche', 'Liguria', 'Molise', 'Friuli-Venezia Giulia', 'Trentino-Alto Adige/Südtirol']
['Isole', 'Sud', 'Centro', 'Nord-est', 'Nord-ovest']
['P', 'O', 'N', 'M', 'L', 'I', 'H', 'G', 'F', 'E', 'D', 'C', 'B', 'A']
['TP', 'CA', 'LE', 'RM', 'PD', 'MI', 'SA', 'TO', 'CH', 'PZ', 'LC', 'OR', 'LT', 'MO', 'PG', 'BO', 'TV', 'VE', 'BA', 'PI', 'PA

In [14]:
m0, m1= mean_var03(train_dataset)
train_dataset = Replace_missing(train_dataset, m0, m1)

8
10


In [15]:
def normalized_data(dataset):
    # Replace commas with periods in all columns
    dataset = dataset.replace(',', '.', regex=True)
    print(dataset.dtypes)
    dataset = dataset.astype('float32')

    # check if the dataset has any nan value
    has_nan_values = dataset.isna().any().any()

    if has_nan_values:
        print("DataFrame contains NaN values.")
    else:
        print("DataFrame does not contain any NaN values.")

    return dataset

In [16]:
train_dataset = normalized_data(train_dataset)
train_dataset.head()

external_score_ver01                      int64
external_score_ver02                      int64
external_score_ver03                      int64
age                                       int64
province                                  int64
juridical_form                            int64
industry_sector                           int64
gross_margin_ratio                       object
core_income_ratio                        object
cash_asset_ratio                         object
consolidated_liabilities_ratio           object
tangible_assets_ratio                    object
revenues                                 object
cr_available                              int64
region                                    int64
geo_area                                  int64
last_statement_age                        int64
overrun_freq_a_revoca_autoliquidanti     object
avg_tension_a_revoca_autoliquidanti      object
std_tension_a_revoca_autoliquidanti      object
max_tension_a_revoca_autoliquidanti     

Unnamed: 0,external_score_ver01,external_score_ver02,external_score_ver03,age,province,juridical_form,industry_sector,gross_margin_ratio,core_income_ratio,cash_asset_ratio,...,avg_count_enti_affidanti,std_count_enti_affidanti,max_count_enti_affidanti,last_count_enti_affidanti,avg_count_numero_prima_info,std_count_numero_prima_info,max_count_numero_prima_info,last_count_numero_prima_info,days_to_default,target
0,10.0,3.0,10.0,15.0,1.0,1.0,1.0,0.464637,0.012593,0.0,...,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,522.0,1.0
1,7.0,3.0,7.0,5.0,2.0,2.0,2.0,0.37234,0.115385,0.235955,...,1.0,0.0,1.0,1.0,2.0,0.0,2.0,2.0,1498.0,0.0
2,7.0,3.0,10.0,5.0,3.0,1.0,3.0,0.27,0.006369,0.359375,...,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,779.0,1.0
3,8.0,2.0,8.0,6.0,4.0,1.0,1.0,0.419929,0.152174,0.13615,...,1.0,0.0,1.0,1.0,0.5,0.522233,1.0,0.0,1498.0,0.0
4,4.0,1.0,8.0,5.0,5.0,1.0,1.0,0.526316,0.083333,0.233333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1498.0,0.0


## <font color="yellow"> SVM Training 

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error

# Assuming Y contains the target variable for regression

# Standardize the features (mean=0 and variance=1)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(train_dataset.drop(columns='days_to_default'))


# Standardize the target variable Y
Y = train_dataset['days_to_default']
scaler_Y = StandardScaler()
Y_scaled = scaler_Y.fit_transform(Y.values.reshape(-1, 1))  # Reshape Y to be a 2D array for StandardScaler


# Create PCA object
pca = PCA(n_components=30)  # Specify the number of components (desired dimensionality)

# Fit PCA to the standardized data and transform the data
X_pca = pca.fit_transform(X_scaled)

# Split the data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X_pca, Y, test_size=0.1, random_state=2)

# Create SVR (Support Vector Regression) model
regressor = SVR(C=0.1, kernel='linear', gamma='scale')

# Fit the model on the training data
regressor.fit(X_train, Y_train)

# Predict on the testing data
Y_pred = regressor.predict(X_test)

# Calculate Mean Squared Error (MSE) as a metric
mse = mean_squared_error(Y_test, Y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 13191.330677910846


## <font color="yellow"> Random Forest Training

In [18]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

X = train_dataset.drop(columns=['days_to_default', 'target'])
y = train_dataset['days_to_default'] # labels

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf_regressor = RandomForestRegressor()

rf_regressor.fit(X_train, y_train)

y_pred_rf = rf_regressor.predict(X_test)

mse = mean_squared_error(y_test, y_pred_rf)
print("Mean Squared Error:", mse)


Mean Squared Error: 112155.19969535676


In [33]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error

# Assuming 'days_to_default' is your target variable
X = train_dataset.drop(columns=['days_to_default', 'target'])
y = train_dataset['days_to_default']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a Random Forest Regressor
rf_regressor = RandomForestRegressor()

# Define the hyperparameters grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}

# Perform GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(estimator=rf_regressor, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_

# Train the model with the best hyperparameters
best_rf_regressor = RandomForestRegressor(**best_params)
best_rf_regressor.fit(X_train, y_train)

# Make predictions on the test set
y_pred_rf = best_rf_regressor.predict(X_test)

# Calculate Mean Squared Error (MSE) as a metric
mse = mean_squared_error(y_test, y_pred_rf)
print("Mean Squared Error:", mse)
print("Best Hyperparameters:", best_params)


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


KeyboardInterrupt: 

## KNN Training

In [19]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

scaler = StandardScaler()
X_scaled = scaler.fit_transform(train_dataset.drop(columns='days_to_default'))

# Standardize the target variable Y
Y = train_dataset['days_to_default']
scaler_Y = StandardScaler()
Y_scaled = scaler_Y.fit_transform(Y.values.reshape(-1, 1))
X_train, X_test, Y_train, Y_test = train_test_split(X_scaled, Y_scaled, test_size=0.2, random_state=42)

# Initialize the KNeighborsRegressor
knn_regressor = KNeighborsRegressor(n_neighbors=5)  # You can adjust the number of neighbors as needed

# Train the model
knn_regressor.fit(X_train, y_train)

# Make predictions
y_pred_knn = knn_regressor.predict(X_test)

# Calculate RMSE (Root Mean Squared Error)
mse_knn = mean_squared_error(y_test, y_pred_knn)
print("MSE in KNN:", mse_knn)


MSE in KNN: 19985.281


## Pre-process Test Dataset

In [20]:
test_dataset = pd.read_csv("../Dataset_DAY1/Data/test_set.csv", delimiter=';')

In [21]:
test_dataset.head()

Unnamed: 0,application_ID,decision_date,company_ID,external_score_ver01,external_score_ver02,late_payment_score,external_score_late_payment_integrated,external_score_moderate,external_score_adverse,external_score_ver03,...,max_rel_used_a_scadenza,last_rel_used_a_scadenza,avg_count_enti_affidanti,std_count_enti_affidanti,max_count_enti_affidanti,last_count_enti_affidanti,avg_count_numero_prima_info,std_count_numero_prima_info,max_count_numero_prima_info,last_count_numero_prima_info
0,a1Q7R00000ZWFXwUAP,2020-10-12,1321219660,5,1,8.0,5.0,6.0,7.0,D,...,132464142538975,132464142538975,116666666666667,389249472080761,2,2,108333333333333,288675134594813,2,1
1,a1Q7R00000ZWJX2UAP,2020-11-12,1420617490,8,1,,,,,F,...,0,0,1,0,1,1,1,0,1,1
2,a1Q7R00000a3E9nUAE,2021-07-05,137667970,8,1,,,,,I,...,33222009569378,320196172248804,3,0,3,3,191666666666667,288675134594813,2,2
3,a1Q7R00000ZWRR6UAP,2021-01-19,137667970,8,1,,,,,F,...,162240829346092,126861244019139,3,0,3,3,191666666666667,288675134594813,2,2
4,a1Q7R00000g6DWvUAM,2022-05-09,2412739090,6,1,,,,,F,...,478388926862611,356901572112098,3,0,3,3,208333333333333,288675134594813,3,3


In [22]:
test_dataset = Drop_unneed_columns(test_dataset)
test_dataset = test_dataset.drop(columns=drop_columns)

In [23]:
category_dics["juridical_form"]["SS"] = 15
category_dics["juridical_form"]["OS"] = 16
for k,v in category_dics.items():
    test_dataset.replace({k:v}, inplace=True)

In [24]:
# find columns with MISSING values 
columns = []
for column in list(test_dataset.columns):
    # Check if there is a value "MISSING" in the 'column_name' column
    missing_values = test_dataset[column] == 'MISSING'

    # Check if any row contains the value "MISSING" in the specified column
    if missing_values.any():
        print(f"'MISSING' in the column: {column}")
        columns.append(column)

# Sum values in the specified columns
dic = {}
for column in columns:
    column_name = column

    count = 0
    sum_values = 0
    # Iterate over the DataFrame
    for index, row in test_dataset.iterrows():
        # Access the value of the specified column for each row
        count +=1
        if isinstance(row[column_name], str):
            continue
        elif isinstance(row[column_name], int):
            sum_values += row[column_name]
    
    dic[column] = int(sum_values/count)

'MISSING' in the column: external_score_ver03
'MISSING' in the column: province
'MISSING' in the column: region
'MISSING' in the column: geo_area


In [25]:
def Replace_missing_test(dataset,val, column):
    # Assuming df is your DataFrame and 'column_to_change' is the column you want to change
    # 'condition_column' is the column based on which you want to change the content
    dataset.loc[(dataset[column] == 'MISSING'), column] = val


    # For example, if you want to change the content of 'column_to_change' to 'new_value' where 'condition_column' is True
    # Replace 'new_value', 'column_to_change', and 'condition_column' with your actual values
    return dataset

In [26]:
for k,v in dic.items():
    test_dataset = Replace_missing_test(test_dataset,v,k)

In [27]:
test_dataset = Replace_bool_toNumbers(test_dataset)

In [28]:
## normalise test dataset 
def normalized_tdata(dataset):
    # Replace commas with periods in all columns
    dataset = dataset.replace(',', '.', regex=True)
    print(dataset.dtypes)
    dataset = dataset.astype('float32')

    # check if the dataset has any nan value
    has_nan_values = dataset.isna().any().any()

    if has_nan_values:
        print("DataFrame contains NaN values.")
    else:
        print("DataFrame does not contain any NaN values.")

    return dataset

test_dataset = normalized_tdata(test_dataset)

external_score_ver01                      int64
external_score_ver02                      int64
external_score_ver03                      int64
age                                       int64
province                                  int64
juridical_form                            int64
industry_sector                           int64
gross_margin_ratio                       object
core_income_ratio                        object
cash_asset_ratio                         object
consolidated_liabilities_ratio           object
tangible_assets_ratio                    object
revenues                                 object
cr_available                              int64
region                                    int64
geo_area                                  int64
last_statement_age                        int64
overrun_freq_a_revoca_autoliquidanti     object
avg_tension_a_revoca_autoliquidanti      object
std_tension_a_revoca_autoliquidanti      object
max_tension_a_revoca_autoliquidanti     

In [29]:
test_dataset.head()
print(test_dataset.shape)

(10678, 39)


## <font color="yellow"> SVM Predictions

In [30]:
import pandas as pd

In [31]:
#Standardize the features (mean=0 and variance=1)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(test_dataset)

# Create PCA object
pca = PCA(n_components=30)  # Specify the number of components (desired dimensionality)

# Fit PCA to the standardized data and transform the data
X_pca = pca.fit_transform(X_scaled)

# Accuracy score on test data
X_test_prediction_final = regressor.predict(X_pca)



# Assuming X_test_prediction_final contains your predictions
# Convert predictions to integers
X_test_prediction_final_int = X_test_prediction_final.astype(int)

# Create a DataFrame with the integer predictions
predictions_SVM = pd.DataFrame(X_test_prediction_final_int)

# Write the DataFrame to a CSV file
predictions_SVM.to_csv('predictions_rg_SVM.csv', index=False)

## <font color="yellow"> Random Forest Predictions  

In [32]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(test_dataset)
#print(X_scaled)

# Predict on the scaled test data
X_test_prediction_final = rf_regressor.predict(test_dataset)

# Convert predictions to integers
X_test_prediction_final_int = X_test_prediction_final.astype(int)

# Create a DataFrame with the integer predictions
predictions_RF = pd.DataFrame(X_test_prediction_final_int)

# Write the DataFrame to a CSV file
predictions_RF.to_csv('predictions_rg_rf.csv', index=False)
