# Practice 2: Cross Validation

**Ojeda Contreras Braulio Melquisedec**

**October 28th, 2022**

*Requirements:*

Using the weatherAUS.csv dataset, do the following:
1. Load the dataset into a pandas dataframe
2. Divide the dataset into a training set (80%) and a test set (20%), making sure the samples are shuffled
3. Using the training set, create the following validation sets through cross-validation:
    - 3 folds
    - 5 folds
    - 10 folds
4. Create the needed classes to store the created data sets
5. Save the data and their tags for each validation set in CSV files:
    - data_validation_train_$<$total_folds$>$_$<$fold_number$>$.csv
    - target_validation_train_$<$total_folds$>$_$<$fold_number$>$.csv
    - data_test_$<$total_folds$>$_$<$fold_number$>$.csv
    - target_test_$<$total_folds$>$_$<$fold_number$>$.csv
6. Save the data and their tags for the test set in CSV files:
    - data_test.csv
    - target_test.csv

In [1]:
# Importing needed libraries
import pandas as pd
import numpy as np
import pickle
import csv
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

In [2]:
# Defining container classes that store the generated data sets
class validation_set:
    """Container for the four pieces of one cross-validation fold.

    The constructor arguments are stored unchanged as attributes:
        X_train, y_train: data and targets of the fold's training part.
        X_test, y_test: data and targets of the fold's held-out part.
    """

    def __init__(self, X_train, y_train, X_test, y_test):
        # Pairwise assignment keeps data and targets of each part together.
        self.X_train, self.y_train = X_train, y_train
        self.X_test, self.y_test = X_test, y_test

class test_set:
    """Container for the test data and its targets.

    The constructor arguments are stored unchanged as the attributes
    ``X_test`` (data) and ``y_test`` (targets).
    """

    def __init__(self, X_test, y_test):
        self.X_test, self.y_test = X_test, y_test

class data_set:
    """Bundle validation data together with test data.

    The constructor arguments are stored unchanged as the attributes
    ``validation_set`` and ``test_set``.
    """

    def __init__(self, validation_set, test_set):
        self.validation_set, self.test_set = validation_set, test_set

In [3]:
# Defining function to save our distinct data in CSV files
def create_csv(file_name, data, col_names, list_opt = False):
    """Write a header row followed by the rows of ``data`` to a CSV file.

    Args:
        file_name: Path of the CSV file to create; an existing file is
            overwritten.
        data: Rows to write. Objects exposing ``tolist()`` (such as NumPy
            arrays) are converted with it; any other iterable is
            materialised with ``list``.
        col_names: Header of the file. Either an iterable of column names or
            a single string holding the names separated by commas.
        list_opt: Set to True when ``data`` is one-dimensional, so that each
            element is written as its own single-column row.
    """
    rows = data.tolist() if hasattr(data, 'tolist') else list(data)
    if list_opt:
        rows = [[value] for value in rows]

    # csv.writer.writerow iterates over its argument, so a plain string would
    # be written one character per column. Split comma-joined names instead.
    if isinstance(col_names, str):
        header = col_names.split(',')
    else:
        header = list(col_names)

    with open(file_name, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(header)
        writer.writerows(rows)

In [4]:
# Defining function to get training and test set
def generate_train_test(file_name, tag_name):
    """Load a CSV file and split it into shuffled training and test parts.

    Args:
        file_name: Path of the comma-separated file to read.
        tag_name: Name of the column holding the targets.

    Returns:
        A list ``[X_train, y_train, X_test, y_test, X_columns, y_columns]``.
        The test part takes 20% of the rows. ``X_columns`` is a single string
        with the names of all non-target columns joined by commas, and
        ``y_columns`` is ``tag_name``.
    """
    frame = pd.read_csv(file_name, sep = ',', engine = 'python')

    # Separate the feature matrix from the target vector.
    features = frame.drop(tag_name, axis = 1).values
    targets = frame[tag_name].values

    # Header for the feature files: every column except the target one.
    feature_header = ','.join(
        name for name in list(frame.columns.values) if name != tag_name
    )

    X_train, X_test, y_train, y_test = train_test_split(
        features, targets, test_size=0.2, shuffle = True
    )
    return [X_train, y_train, X_test, y_test, feature_header, tag_name]

In [5]:
# Defining function which implements cross validation
def generate_folds(data, k, output_dir = "./p2_weatherAUS"):
    """Split the training set into ``k`` folds and save every fold as CSV.

    For each fold, four files are written into ``output_dir``:
    ``data_validation_train_<k>_<fold>.csv``,
    ``target_validation_train_<k>_<fold>.csv``,
    ``data_test_<k>_<fold>.csv`` and ``target_test_<k>_<fold>.csv``,
    where ``<fold>`` counts from 1.

    Args:
        data: Sequence ``[X_train, y_train, X_test, y_test, X_columns,
            y_columns]`` as returned by ``generate_train_test``.
        k: Number of folds.
        output_dir: Directory receiving the CSV files. It is created when it
            does not exist yet.

    Returns:
        A ``data_set`` whose ``validation_set`` attribute is the list of
        per-fold ``validation_set`` objects and whose ``test_set`` attribute
        wraps the untouched ``X_test`` / ``y_test``.
    """
    import os

    X_train = data[0]
    y_train = data[1]
    X_test = data[2]
    y_test = data[3]
    X_columns = data[4]
    y_columns = data[5]

    print('Cross Validation k =', k)

    # Writing the fold files fails if the target directory is missing.
    os.makedirs(output_dir, exist_ok=True)

    validation_sets = []
    kf = KFold(n_splits = k)
    for fold, (train_index, test_index) in enumerate(kf.split(X_train), start=1):
        X_train_v, X_test_v = X_train[train_index], X_train[test_index]
        y_train_v, y_test_v = y_train[train_index], y_train[test_index]
        validation_sets.append(validation_set(X_train_v, y_train_v, X_test_v, y_test_v))

        # (file prefix, values, header, values are one-dimensional)
        outputs = [
            ("data_validation_train", X_train_v, X_columns, False),
            ("target_validation_train", y_train_v, y_columns, True),
            ("data_test", X_test_v, X_columns, False),
            ("target_test", y_test_v, y_columns, True),
        ]
        for prefix, values, header, is_target in outputs:
            create_csv(file_name = os.path.join(output_dir, f"{prefix}_{k}_{fold}.csv"),
                       data = values, col_names = header, list_opt = is_target)

    my_test_set = test_set(X_test, y_test)
    my_data_set = data_set(validation_sets, my_test_set)

    return my_data_set

In [6]:
# Loading the weather data and splitting it into training and test sets,
# using the 'RainTomorrow' column as the target
data = generate_train_test(file_name = './weatherAUS.csv', tag_name = 'RainTomorrow')

In [7]:
# Executing cross validation for every requested number of folds
ks = [3, 5, 10]
for k in ks:
    new_data = generate_folds(data, k)

    # Save dataset in pickle; the context manager closes the file even when
    # pickling raises
    with open('./p2_weatherAUS/dataset_f' + str(k) + '.pkl', 'wb') as dataset_file:
        pickle.dump(new_data, dataset_file)
    print('Completed')

Cross Validation k = 3
Completed
Cross Validation k = 5
Completed
Cross Validation k = 10
Completed


In [8]:
# Saving data from test set. The values are read straight from the split
# (data[2] = X_test, data[3] = y_test, data[4] / data[5] = column names)
# instead of from the variable left over by the cross-validation loop, so
# this cell does not depend on that loop having run.
create_csv(file_name = "./p2_weatherAUS/data_test.csv", data = data[2], col_names = data[4])
create_csv(file_name = "./p2_weatherAUS/target_test.csv", data = data[3], col_names = data[5], list_opt = True)