In [None]:
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler

## Load Training Data

In [None]:
train_dataset = pd.read_csv("/Users/andreasalinetti/Documents/HACK4SDS/Dataset_DAY1/Data/train_set.csv", delimiter=';')

In [None]:
train_dataset.head() 

## Analizing Data

In [None]:
import seaborn as sns

In [None]:
train_dataset.describe()

In [None]:
#train_dataset.values.sum()

In [None]:
sns.countplot(x='external_score_ver03', hue= 'juridical_form', data= train_dataset)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Assuming df is your DataFrame
# df = pd.read_csv("your_dataset.csv")

# Plot the distribution using Seaborn
plt.figure(figsize=(10, 6))
sns.countplot(data=train_dataset, x='external_score_ver03')
plt.xlabel('Category Value')
plt.ylabel('Count')
plt.title('Distribution of external_score_ver03')
plt.xticks(rotation=45)  # Rotate x-axis labels for better readability
plt.tight_layout()  # Adjust layout to prevent overlapping labels
plt.show()

## <font color="yellow"> Clean and encode Data

#### Drop unneeded columns

In [None]:
## drop features
def Drop_unneed_columns(test, dataset):
    cols= ['days_to_default', 'application_ID', 'decision_date', 'company_ID']
    if test:
        cols.remove('days_to_default')
        dataset= dataset.drop(columns=cols)
    else:
        dataset= dataset.drop(columns=cols)
    return dataset


#### Find columns with to many Nan's values

In [None]:
def Nan_values(dataset):
    column_names = dataset.columns.tolist()
    drop_columns = []
    for name in column_names:
        nan_count = dataset[name].isna().sum()
        print(f"column {name}: {nan_count}")
        if (nan_count/28000) > 0.5:
            print(f"Number of NaN values in column '{name}': {nan_count}")
            drop_columns.append(name)
    return drop_columns

#### Label enconding

In [None]:
def Replace_cate_to_value(column_name, dataset):
    # Extract categories

    # Extract unique category names from the column
    unique_categories = dataset[column_name].unique()

    # convert 'numpy.ndarray' in to a python list
    l = unique_categories.tolist()
    
    if 'MISSING' in l:
        l.remove('MISSING')
        l.sort(reverse=True)
    # print(unique_categories)

    # print(f"remove{l}")
    dic = { l[i]:i+1 for i in range(0, len(l))}

    # dic = {}

    # for name in unique_categories:
    #     if name != "MISSING":
    #         dic{}
    # print(dic)

    # Replace values in the column based on the dictionary mapping
    dataset[column_name] = dataset[column_name].replace(dic)
    return dic, dataset

In [None]:
def Category_values(dataset):
    column_names = ['industry_sector', 'region', 'geo_area','external_score_ver03', 'province','juridical_form']
    dic = {}
    for column_name in column_names:
        category_dic, dataset = Replace_cate_to_value(column_name, dataset)
        dic[column_name] = category_dic
    return dic, dataset

#### Replace True and False values to numerical values in Columns

In [None]:
def Replace_bool_toNumbers(dataset):
    dataset['cr_available'] = [int(dataset['cr_available'][i]) for i in range(len(dataset['cr_available']))]
    dataset['cr_available']
    return dataset

#### Mean of external score var 03 

In [None]:
def mean_var03(dataset):
    s0, s1, c0, c1 = 0,0,0,0
    # unique_labels = dataset['target'].unique()
    for index, row in dataset.iterrows():
        if row['external_score_ver03'] != 'MISSING':
            if row['target'] == 0:
                s0 += row['external_score_ver03']
                c0 +=1
            elif row['target'] == 1:
                s1 +=  row['external_score_ver03']
                c1 += 1

    m0 = round(s0/c0)
    m1 = round(s1/c1)
    print(m0)
    print(m1)
    return m0,m1


#### Replace MISSING values to Mean finded 

In [None]:
def Replace_missing(dataset, m0, m1):
    # Assuming df is your DataFrame and 'column_to_change' is the column you want to change
    # 'condition_column' is the column based on which you want to change the content
    dataset.loc[(dataset['target'] == 1) & (dataset['external_score_ver03'] == 'MISSING'), 'external_score_ver03'] = m1
    dataset.loc[(dataset['target'] == 0) & (dataset['external_score_ver03'] == 'MISSING'), 'external_score_ver03'] = m0
    dataset['external_score_ver03']

    # For example, if you want to change the content of 'column_to_change' to 'new_value' where 'condition_column' is True
    # Replace 'new_value', 'column_to_change', and 'condition_column' with your actual values
    return dataset

## <font color="green"> Main code for train dataset 

In [None]:
# Drop columns 
train_dataset = Drop_unneed_columns(False,train_dataset)
drop_columns = Nan_values(train_dataset)
train_dataset = train_dataset.drop(columns=drop_columns)



In [None]:
# replace bool values to numerical ones 
category_dics, train_dataset = Category_values(train_dataset)
train_dataset = Replace_bool_toNumbers(train_dataset)


In [None]:
# v03 column with missing values 
m0, m1= mean_var03(train_dataset)
train_dataset = Replace_missing(train_dataset, m0, m1)

## <font color="yellow"> Normalise Datase

#### Replace the "," to ".", in such a way to pass from object to number

In [None]:
def normalized_data(dataset):
    # Replace commas with periods in all columns
    dataset = dataset.replace(',', '.', regex=True)
    print(dataset.dtypes)
    dataset = dataset.astype('float32')

    # check if the dataset has any nan value
    has_nan_values = dataset.isna().any().any()

    if has_nan_values:
        print("DataFrame contains NaN values.")
    else:
        print("DataFrame does not contain any NaN values.")

    return dataset

#### Normalise Columns

## <font color="green"> Main code Normalise Dataset

In [None]:
train_dataset = normalized_data(train_dataset)

In [None]:
pd.set_option('display.max_columns', None)
train_dataset.head(30)

## <font color="yellow"> Build a Balanced Dataset

In [None]:
def split_dataframe_by_label(df, label_column, label_value_1, label_value_2, sample_size):
    # Separate the DataFrame based on the labels
    subset_1 = df[df[label_column] == label_value_1]
    subset_2 = df[df[label_column] == label_value_2]
    
    # Take a random sample of rows from each subset
    subset_1_sampled = subset_1.sample(n=sample_size, random_state=42)
    subset_2_sampled = subset_2.sample(n=sample_size, random_state=42)
    
    # Concatenate the sampled subsets to form the final split
    final_split = pd.concat([subset_1_sampled, subset_2_sampled], ignore_index=True)

    return final_split

train_dataset = split_dataframe_by_label(train_dataset, 'target', 0, 1, 6894)
print(train_dataset.shape)

## <font color="yellow"> SVM Model

In [None]:
from sklearn import svm
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [None]:
Y = train_dataset['target']

# Standardize the features (mean=0 and variance=1)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(train_dataset.drop(columns='target'))

# Create PCA object
pca = PCA(n_components=30)  # Specify the number of components (desired dimensionality)

# Fit PCA to the standardized data and transform the data
X_pca = pca.fit_transform(X_scaled)

X_train, X_test, Y_train, Y_test = train_test_split(X_pca,Y, test_size=0.1, stratify=Y, random_state=2)

print(X_train.shape, X_test.shape)

# classifier = svm.SVC(C=0.1 ,kernel='linear', gamma=0.001, class_weight="balanced")
classifier = svm.SVC(C=0.1, kernel='linear', gamma='scale', class_weight='balanced', verbose=True)

classifier.fit(X_train, Y_train)

In [None]:
X_test_prediction = classifier.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction,Y_test)
print(test_data_accuracy)

#### F1 Score SVM 

In [None]:
from sklearn.metrics import f1_score

# Assuming classifier is your trained SVM model and X_test, y_test are your test data
y_pred = classifier.predict(X_test)

# Calculate F1 score
f1 = f1_score(Y_test, y_pred)

print("F1 Score:", f1)

## <font color="yellow"> Random Forest 

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Step 1: Split the dataset into features (X) and target variable (y)
X = train_dataset.drop(columns=['target'])  # Assuming 'target_column' is your target variable
y = train_dataset['target']

# Step 2: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Create a Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Step 4: Train the classifier on the training data
rf_classifier.fit(X_train, y_train)

# Step 5: Evaluate the classifier on the testing data
y_pred = rf_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("Accuracy:", accuracy)

#### F1 Score

In [None]:
# Step 6: Compute the F1 score
f1 = f1_score(y_test, y_pred)
print(f1)

## <font color="yellow"> Gaussian Naive Bayes

In [None]:

from sklearn.naive_bayes import GaussianNB


# Split the dataset into features (X) and target variable (y)
X = train_dataset.drop(columns=['target'])  # Assuming 'target' is your target variable
y = train_dataset['target']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the Gaussian Naive Bayes classifier
gnb_classifier = GaussianNB()

# Train the classifier on the training data
gnb_classifier.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = gnb_classifier.predict(X_test)

# Calculate the accuracy of the classifier
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


In [None]:
from sklearn.model_selection import cross_val_score

# Create the Gaussian Naive Bayes classifier
gnb_classifier = GaussianNB()

# Perform 5-fold cross-validation
cv_scores = cross_val_score(gnb_classifier, X, y, cv=5)

# Print the cross-validation scores
print("Cross-validation Scores:", cv_scores)

# Calculate and print the mean cross-validation score
mean_cv_score = cv_scores.mean()
print("Mean Cross-validation Score:", mean_cv_score)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif

# Split the dataset into features (X) and target variable (y)
X = train_dataset.drop(columns=['target'])  # Assuming 'target' is your target variable
y = train_dataset['target']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Perform feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Perform feature selection
selector = SelectKBest(score_func=f_classif, k=39)  # Select top 20 features
X_train_selected = selector.fit_transform(X_train_scaled, y_train)
X_test_selected = selector.transform(X_test_scaled)

# Create the Gaussian Naive Bayes classifier
gnb_classifier = GaussianNB()

# Train the classifier on the selected features
gnb_classifier.fit(X_train_selected, y_train)

# Make predictions on the testing data
y_pred = gnb_classifier.predict(X_test_selected)

# Calculate the accuracy of the classifier
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


#### F1 Score 

In [None]:
# Step 6: Compute the F1 score
f1 = f1_score(y_test, y_pred)
print(f1)

## <font color="yellow"> KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create KNN classifier
k = 5  # Number of neighbors
knn = KNeighborsClassifier(n_neighbors=k, weights='distance', algorithm='auto', metric='manhattan')

# Train the classifier
knn.fit(X_train, y_train)

# Make predictions on the test set
y_pred = knn.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')


#### F1 Score 

In [None]:
# Step 6: Compute the F1 score
f1 = f1_score(y_test, y_pred)
print(f1)

## <font color="yellow"> Prima Neural Network 

In [None]:
# # Define the neural network architecture
# class NeuralNetwork(nn.Module):
#     def __init__(self, input_size):
#         super(NeuralNetwork, self).__init__()
#         self.fc1 = nn.Linear(input_size, 16)  
#         self.fc2 = nn.Linear(16,8)
#         self.relu = nn.ReLU()
#         self.fc3 = nn.Linear(8, 1)
#         self.dropout = nn.Dropout(p=0.2)
#         self.fc4 = nn.Linear(8, 1)  # Output layer with 1 neuron for binary classification

#     def forward(self, x):
#         x = self.fc1(x)
#         x = self.relu(x)
#         x = self.dropout(x)
#         x = self.fc2(x)
#         x = self.relu(x)
#         x = self.dropout(x)
#         x = self.fc3(x)
#         x = self.relu(x)
#         #x = self.fc4(x)
#         return torch.sigmoid(x)



# accuracy_values = []
# loss_values = []
# X = train_dataset.iloc[:, :-1].to_numpy()
# y = train_dataset.iloc[:, -1].to_numpy()

# num_folds = 5
# input_size = 39
# num_epochs = 40
# num_models = 1

# kf = KFold(n_splits=num_folds, shuffle=True)

# criterion = nn.BCELoss() 
# l1_lambda = 0.01
# l2_lambda = 0.01
# fold_params = []

# for model_index in range(num_models):

#     for fold, (train_indices, val_indices) in enumerate(kf.split(X)):
#         print(f'Fold {fold+1}/{num_folds}')

#         #X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.4, random_state=42)

#         # Split the data into training and validation sets
#         X_train, X_val = X[train_indices], X[val_indices]
#         y_train, y_val = y[train_indices], y[val_indices]

#         # Convert data to PyTorch tensors
#         X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
#         y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
#         X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
#         y_val_tensor = torch.tensor(y_val, dtype=torch.float32)
        
#         model = NeuralNetwork(input_size)
#         optimizer = optim.Adam(model.parameters(), lr=0.001)
#         # Train the neural network

#         for epoch in range(num_epochs):
#             # Forward pass
#             outputs = model(X_train_tensor)
#             loss = criterion(outputs, y_train_tensor.view(-1, 1))
#             loss_values.append(loss.item())

#             l1_reg = torch.tensor(0., requires_grad=True)
#             for param in model.parameters():
#                 l1_reg = l1_reg + torch.norm(param, p=1)
#             loss = loss + l1_lambda * l1_reg

#             # L2 regularization
#             l2_reg = torch.tensor(0., requires_grad=True)
#             for param in model.parameters():
#                 l2_reg = l2_reg + torch.norm(param, p=2)
#             loss = loss + l2_lambda * l2_reg
            
#             # Backward pass and optimization
#             optimizer.zero_grad()
#             loss.backward()
#             optimizer.step()

#         fold_params.append(model.state_dict())
#         print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}, Fold:{fold}')

#         # Evaluate the model
#         with torch.no_grad():
#             # Predict probabilities on the test set
#             outputs = model(X_val_tensor)
#             predicted = (outputs >= 0.5).float()
            
#             # Calculate accuracy
#             accuracy = (predicted == y_val_tensor.view(-1, 1)).float().mean()
#             accuracy_values.append(accuracy)
#             print(f'Accuracy on test set: {accuracy.item()*100:.2f}%')
#     torch.save(model.state_dict(), f'model_{model_index}.pth')

# avg_params = {}

# for key in fold_params[0].keys():
#     avg_params[key] = torch.stack([params[key] for params in fold_params]).mean(dim=0)

# # Create a new model with the average parameters
# average_model = NeuralNetwork(input_size)
# average_model.load_state_dict(avg_params)
# print(f'Averagea ccuracy on test set: {np.array(accuracy_values).mean()*100:.2f}%')


#### Plot Loss Chart

In [None]:
# import matplotlib.pyplot as plt

# # Plot the loss values
# plt.plot(loss_values, label='Training Loss')
# plt.xlabel('Epoch')
# plt.ylabel('Loss')
# plt.title('Training Loss Over Epochs')
# plt.legend()
# plt.grid(True)
# plt.show()

#### Calculated F1 score

In [None]:
# from sklearn.metrics import f1_score

# true_labels = y_val_tensor.numpy().astype(int)
# predicted_labels = 1-(predicted.numpy())
# # Calculate F1 score
# f1 = f1_score(true_labels, predicted_labels)
# print(f'F1 score on test set: {f1:.4f}')

## <font color="yellow"> Test Dataset

In [None]:
from sklearn.impute import SimpleImputer

#### Load Dataset

In [None]:
test_dataset = pd.read_csv("/Users/andreasalinetti/Documents/HACK4SDS/Dataset_DAY1/Data/test_set.csv", delimiter=';')


In [None]:
test_dataset.head()

#### Preprocess Test Dataset

In [None]:
# Drop columns 
test_dataset = Drop_unneed_columns(True,test_dataset)

test_dataset = test_dataset.drop(columns=drop_columns)

In [None]:

category_dics["juridical_form"]["SS"] = 15
category_dics["juridical_form"]["OS"] = 16


In [None]:
#print(category_dics["juridical_form"])
for k,v in category_dics.items():
    test_dataset.replace({k:v}, inplace=True)


In [None]:
c = test_dataset["external_score_ver03"].value_counts()
# print(c)

In [None]:
# find columns with MISSING values 
columns = []
for column in list(test_dataset.columns):
    # Check if there is a value "MISSING" in the 'column_name' column
    missing_values = test_dataset[column] == 'MISSING'

    # Check if any row contains the value "MISSING" in the specified column
    if missing_values.any():
        print(f"'MISSING' in the column: {column}")
        columns.append(column)
    

In [None]:
# Sum values in the specified columns
dic = {}
for column in columns:
    column_name = column

    count = 0
    sum_values = 0
    # Iterate over the DataFrame
    for index, row in test_dataset.iterrows():
        # Access the value of the specified column for each row
        count +=1
        if isinstance(row[column_name], str):
            continue
        elif isinstance(row[column_name], int):
            sum_values += row[column_name]
    
    dic[column] = int(sum_values/count)

print(dic)
       

In [None]:
def Replace_missing_test(dataset,val, column):
    # Assuming df is your DataFrame and 'column_to_change' is the column you want to change
    # 'condition_column' is the column based on which you want to change the content
    dataset.loc[(dataset[column] == 'MISSING'), column] = val


    # For example, if you want to change the content of 'column_to_change' to 'new_value' where 'condition_column' is True
    # Replace 'new_value', 'column_to_change', and 'condition_column' with your actual values
    return dataset

In [None]:
for k,v in dic.items():
    test_dataset = Replace_missing_test(test_dataset,v,k)

In [None]:
c = test_dataset["external_score_ver03"].value_counts()
# print(c)

In [None]:
test_dataset = Replace_bool_toNumbers(test_dataset)

In [None]:
pd.set_option('display.max_columns', None)
test_dataset.head()

In [None]:
## normalise test dataset 
def normalized_tdata(dataset):
    # Replace commas with periods in all columns
    dataset = dataset.replace(',', '.', regex=True)
    # print(dataset.dtypes)
    dataset = dataset.astype('float32')

    # check if the dataset has any nan value
    has_nan_values = dataset.isna().any().any()

    if has_nan_values:
        print("DataFrame contains NaN values.")
    else:
        print("DataFrame does not contain any NaN values.")

    return dataset
test_dataset = normalized_tdata(test_dataset)

In [None]:
test_dataset.head()

In [None]:
print(test_dataset.shape)

## <font color="yellow">SVM Predictions 

In [None]:
#Standardize the features (mean=0 and variance=1)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(test_dataset)

# Create PCA object
pca = PCA(n_components=30)  # Specify the number of components (desired dimensionality)

# Fit PCA to the standardized data and transform the data
X_pca = pca.fit_transform(X_scaled)

# X_train, X_test, Y_train, Y_test = train_test_split(X_pca,Y, test_size=0.1, stratify=Y, random_state=2)

# print(X_train.shape, X_test.shape)

# classifier = svm.SVC(C=0.1 ,kernel='linear', gamma=0.001, class_weight="balanced")
# classifier = svm.SVC(C=0.1, kernel='linear', gamma='scale', class_weight='balanced', verbose=True)



# Accuracy score on test data
X_test_prediction_final = classifier.predict(X_pca)
# test_data_accuracy = accuracy_score(X_test_prediction,Y_test)
print(X_test_prediction_final)





In [None]:
import pandas as pd

# Assuming X_test_prediction_final contains your predictions
# Convert predictions to integers
X_test_prediction_final_int = X_test_prediction_final.astype(int)

# Create a DataFrame with the integer predictions
predictions_SVM = pd.DataFrame(X_test_prediction_final_int, columns=['label'])

# Write the DataFrame to a CSV file
predictions_SVM.to_csv('predictions_SVM.csv', index=False)


## <font color="yellow"> Random Forest Predictions

In [None]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(test_dataset)


# Accuracy score on test data
X_test_prediction_final = rf_classifier.predict(X_scaled)
# test_data_accuracy = accuracy_score(X_test_prediction,Y_test)
print(X_test_prediction_final)

# Assuming X_test_prediction_final contains your predictions
# Convert predictions to integers
X_test_prediction_final_int = X_test_prediction_final.astype(int)

# Create a DataFrame with the integer predictions
predictions_RF = pd.DataFrame(X_test_prediction_final_int, columns=['label'])

# Write the DataFrame to a CSV file
predictions_RF.to_csv('predictions_rf.csv', index=False)

## <font color="yellow"> Gaussian Naive Bayes

In [None]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(test_dataset)


# Accuracy score on test data
X_test_prediction_final = gnb_classifier.predict(X_scaled)
# test_data_accuracy = accuracy_score(X_test_prediction,Y_test)
print(X_test_prediction_final)

# Assuming X_test_prediction_final contains your predictions
# Convert predictions to integers
X_test_prediction_final_int = X_test_prediction_final.astype(int)

# Create a DataFrame with the integer predictions
predictions_GNB = pd.DataFrame(X_test_prediction_final_int, columns=['label'])

# Write the DataFrame to a CSV file
predictions_GNB.to_csv('predictions_gnb.csv', index=False)

## <font color="yellow"> KNN


In [None]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(test_dataset)


# Accuracy score on test data
X_test_prediction_final = knn.predict(X_scaled)
# test_data_accuracy = accuracy_score(X_test_prediction,Y_test)
print(X_test_prediction_final)

# Assuming X_test_prediction_final contains your predictions
# Convert predictions to integers
X_test_prediction_final_int = X_test_prediction_final.astype(int)

# Create a DataFrame with the integer predictions
predictions_KNN = pd.DataFrame(X_test_prediction_final_int, columns=['label'])

# Write the DataFrame to a CSV file
predictions_KNN.to_csv('predictions_knn.csv', index=False)

## <font color="yellow"> Voting Ensemble

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.pipeline import Pipeline

In [None]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(test_dataset)

# Accuracy score on test data
X_test_prediction_final = voting_classifier.predict(X_scaled)
# test_data_accuracy = accuracy_score(X_test_prediction,Y_test)
print(X_test_prediction_final)

# Assuming X_test_prediction_final contains your predictions
# Convert predictions to integers
X_test_prediction_final_int = X_test_prediction_final.astype(int)

# Create a DataFrame with the integer predictions
predictions_df = pd.DataFrame(X_test_prediction_final_int, columns=['label'])

# Write the DataFrame to a CSV file
predictions_df.to_csv('predictions_voting.csv', index=False)

In [329]:
import torch.nn.functional as F
import torch
weights = F.softmax(torch.tensor([3.0, 7.0, 8.0, 4.0]))
print(weights)

prediction_mean = ((0.18*(1-predictions_RF) + 0.28*predictions_KNN + 0.9*predictions_GNB + 0.4*(1-predictions_SVM))).astype(int)
prediction_mean.to_csv('predictions_voting2.csv', index=False)


tensor([0.0048, 0.2641, 0.7179, 0.0131])


  weights = F.softmax(torch.tensor([3.0,7.0,8.0,4.0]))


#### Export The CSV File