# Project supervised learning - Drunk smurfs

Jean-Baptiste Maene - Denis Topallaj - Lander Pauwels Malengier 

## 0. Data-cleaning

In [1]:
# import dependencies

%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('seaborn-darkgrid')

  plt.style.use('seaborn-darkgrid')


In [2]:
# read the csv file

df = pd.read_csv('train_V2.csv')
df_test = pd.read_csv('score.csv')

## 0.1 Drop inconsistent or empty data

### 0.1.1 Dropping reoccurring empty rows

In [3]:
# when these three columns are not filled, the rest of the data is not filled.
# the rows without data are dropped.

indexes = df[(df['income_am'].isnull()) & (df['profit_last_am'].isnull()) & (df['profit_am'].isnull())].index

df = df.drop(index=indexes.array)

print('train', df.shape)

# same with the test file

indexes = df_test[(df_test['income_am'].isnull()) & (df_test['profit_last_am'].isnull()) & (df_test['profit_am'].isnull())].index

df_test = df_test.drop(index=indexes.array)

print('test', df_test.shape)


train (4947, 53)
test (496, 50)


### 0.1.2 Dropping duplicates

In [4]:
df.drop_duplicates()

df_test.drop_duplicates()

Unnamed: 0,income_am,profit_last_am,profit_am,damage_am,damage_inc,crd_lim_rec,credit_use_ic,gluten_ic,lactose_ic,insurance_ic,...,score1_pos,score1_neg,score2_pos,score2_neg,score3_pos,score3_neg,score4_pos,score4_neg,score5_pos,score5_neg
0,5660.0,4320.0,8640.0,0.0,0.0,8000.0,0.0,0.0,1.0,0.0,...,0.538419,0.396819,0.423742,0.763608,,,,,,
1,3990.0,9.0,3450.0,0.0,0.0,12500.0,0.0,0.0,0.0,1.0,...,,,,,,,,,,
2,1158.0,82.0,4194.0,408.0,4.0,12000.0,0.0,0.0,0.0,1.0,...,0.009811,0.592842,,,0.252444,0.724693,0.818064,0.387361,,
3,2451.0,791.0,2119.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,,,,,,,,,,
4,946.0,222.0,2036.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,820.0,216.0,7794.0,1103.0,3.0,9000.0,0.0,0.0,0.0,1.0,...,0.307239,0.660891,,,0.738333,0.914151,,,0.262224,8.060677
496,6092.0,2100.0,3137.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,0.419981,0.668320,,,,,,
497,2301.0,214.0,2516.0,0.0,0.0,11000.0,0.0,0.0,0.0,0.0,...,0.837325,0.663044,0.697171,0.353229,,,,,,
498,492.0,0.0,3716.0,713.0,2.0,5000.0,0.0,0.0,0.0,1.0,...,,,,,,,,,0.461598,4.757132


### 0.1.3 Dropping outliers

In [5]:
# score5_neg has scores ranging from 7995 trillion to -472 trillion and everything in between. score5_neg and score5_pos will be dropped.

# looking at the score of the other four scoring systems we can confidently say that score5_neg can be dropped (score5_pos also has to be dropped, because they both make a pair)

# quantile score can only range 0 < q < 1

columns = ["score5_neg", "score5_pos"]

for col in columns:
    if col in df.columns:
        df.drop(col, axis=1, inplace=True)

# dropping the score5 for the test data set

for col in columns:
    if col in df_test.columns:
        df_test.drop(col, axis=1, inplace=True)

print("train", df.shape)
print("test", df_test.shape)

train (4947, 51)
test (496, 48)


### 0.1.4 Dropping unethical/'useless' columns

In [6]:
# Having gender, place or origin, race ... as a factor to disallow smurfs from entering a hotel is unethical.

drop_columns = ["urban_ic", "gluten_ic", "lactose_ic"]

for col in drop_columns:
	if col in df.columns:
		df = df.drop(col, axis=1)

for col in drop_columns:
	if col in df_test.columns:
		df_test = df_test.drop(col, axis=1)

print("train", df.shape)
print("test", df_test.shape)

train (4947, 48)
test (496, 45)


## 0.2 Handle missing data

In [7]:
# These are the columns with missing data

print("train\n", df.isnull().sum()[df.isnull().sum() != 0])

print("\ntest\n", df_test.isnull().sum()[df_test.isnull().sum() != 0])

train
 cab_requests         35
neighbor_income     186
dining_ic            35
presidential         35
tenure_mts          339
tenure_yrs          339
shop_use             35
score1_pos         3722
score1_neg         3633
score2_pos         3738
score2_neg         3643
score3_pos         3686
score3_neg         3580
score4_pos         3724
score4_neg         3623
dtype: int64

test
 cab_requests         1
neighbor_income     17
dining_ic            1
presidential         1
tenure_mts          35
tenure_yrs          35
shop_use             1
score1_pos         374
score1_neg         362
score2_pos         385
score2_neg         372
score3_pos         360
score3_neg         351
score4_pos         377
score4_neg         370
dtype: int64


### 0.2.1 Using mean for missing data

In [8]:
from sklearn.impute import SimpleImputer

df_mean_imputed = df.copy()

# mean worthy columns
columns = ["cab_requests", "dining_ic", "presidential", "shop_use", "neighbor_income"]

for col in columns:
	mean_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

	mean_imputer = mean_imputer.fit(np.array(df[col]).reshape(-1, 1))
 
	df_mean_imputed[col] = mean_imputer.transform(np.array(df[col]).reshape(-1, 1))

print("train\n", df_mean_imputed.isnull().sum()[df_mean_imputed.isnull().sum() != 0])


# using mean_imputer for the test data
df_mean_imputed_test = df_test.copy()

# mean worthy columns
columns = ["cab_requests", "dining_ic", "presidential", "shop_use"]

for col in columns:
	mean_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

	mean_imputer = mean_imputer.fit(np.array(df_test[col]).reshape(-1, 1))
 
	df_mean_imputed_test[col] = mean_imputer.transform(np.array(df_test[col]).reshape(-1, 1))

print("\ntest\n", df_mean_imputed_test.isnull().sum()[df_mean_imputed_test.isnull().sum() != 0])



train
 tenure_mts     339
tenure_yrs     339
score1_pos    3722
score1_neg    3633
score2_pos    3738
score2_neg    3643
score3_pos    3686
score3_neg    3580
score4_pos    3724
score4_neg    3623
dtype: int64

test
 neighbor_income     17
tenure_mts          35
tenure_yrs          35
score1_pos         374
score1_neg         362
score2_pos         385
score2_neg         372
score3_pos         360
score3_neg         351
score4_pos         377
score4_neg         370
dtype: int64


### 0.2.2 Using KNN for missing data

In [9]:
# K-Nearest Neighbors (KNN) Imputation will be used for "tenure_mts" and "tenure_yrs" because it is a good option when the missingness is random, and there is no clear pattern to the missing data.

from sklearn.impute import KNNImputer

df_knn_imputed = df_mean_imputed.copy()

columns = ["tenure_mts", "tenure_yrs"]

for col in columns:
	knn_imputer = KNNImputer(n_neighbors=5)
	imputed_col = knn_imputer.fit_transform(df_knn_imputed[[col]])
	df_knn_imputed[col] = imputed_col

print("train\n", df_knn_imputed.isnull().sum()[df_knn_imputed.isnull().sum() != 0])


# using the KNNImputer for the test data set

df_knn_imputed_test = df_mean_imputed_test.copy()

columns = ["tenure_mts", "tenure_yrs"]

for col in columns:
	knn_imputer = KNNImputer(n_neighbors=5)
	imputed_col = knn_imputer.fit_transform(df_knn_imputed_test[[col]])
	df_knn_imputed_test[col] = imputed_col

print("\ntest\n", df_knn_imputed_test.isnull().sum()[df_knn_imputed_test.isnull().sum() != 0])

train
 score1_pos    3722
score1_neg    3633
score2_pos    3738
score2_neg    3643
score3_pos    3686
score3_neg    3580
score4_pos    3724
score4_neg    3623
dtype: int64

test
 neighbor_income     17
score1_pos         374
score1_neg         362
score2_pos         385
score2_neg         372
score3_pos         360
score3_neg         351
score4_pos         377
score4_neg         370
dtype: int64


### 0.2.3 Handle missing data for scores

In [10]:
# We will be looping over each row and averaging the scores inside the pos and neg column.

# When there is no scores given by the staff, the score will default to 0.

# In the end the columns will be dropped and replaced by two new columns: "score_pos" and "score_neg".

# There are also checks added for when columns that need to be removed are already removed.

import numpy as np

def calculate_scores(df, columns):
	for i, row in df.iterrows():
		pos_sum = 0
		pos_count = 0
		neg_sum = 0
		neg_count = 0
		for col in columns:
			if col not in df.columns:
				break
			if "pos" in col:
				if not np.isnan(row[col]):
					pos_sum += row[col]
					pos_count += 1
			elif "neg" in col:
				if not np.isnan(row[col]):
					neg_sum += row[col]
					neg_count += 1
		if pos_count > 0:
			df.at[i, 'score_pos'] = pos_sum / pos_count
		if neg_count > 0:
			df.at[i, 'score_neg'] = neg_sum / neg_count

	df['score_pos'].fillna(0, inplace=True)
	df['score_neg'].fillna(0, inplace=True)
	
	return df



In [11]:
# columns that need to be altered/removed
columns=["score1_neg", "score1_pos", "score2_pos", "score2_neg", "score3_pos", "score3_neg", "score4_pos", "score4_neg"]

# create new columns that contain the avg of each row
df_knn_imputed = calculate_scores(df_knn_imputed, columns=columns)
df_knn_imputed_test = calculate_scores(df_knn_imputed_test, columns=columns)

# Drop the unneeded columns from the dataset
df_knn_imputed.drop(columns=[col for col in columns if col in df_knn_imputed], inplace=True)

df_knn_imputed_test.drop(columns=[col for col in columns if col in df_knn_imputed_test], inplace=True)

# show the amount of null fields
print("\ntrain\n", df_knn_imputed.isnull().sum()[df_knn_imputed.isnull().sum() != 0])
print("\ntest\n", df_knn_imputed_test.isnull().sum()[df_knn_imputed_test.isnull().sum() != 0])

df_knn_imputed.head()


train
 Series([], dtype: int64)

test
 neighbor_income    17
dtype: int64


Unnamed: 0,income_am,profit_last_am,profit_am,damage_am,damage_inc,crd_lim_rec,credit_use_ic,insurance_ic,spa_ic,empl_ic,...,gender,shop_am,shop_use,retired,gold_status,outcome_profit,outcome_damage_inc,outcome_damage_amount,score_pos,score_neg
0,227.0,0.0,3201.0,888.0,6.0,15000.0,0.0,0.0,1.0,0.0,...,M,0.0,0.0,0.0,0.0,1791.66,0,0.0,0.652958,0.532814
1,268.0,16.0,1682.0,0.0,0.0,750.0,0.0,1.0,1.0,0.0,...,M,0.0,0.0,0.0,0.0,1672.78,1,829.66,0.0,0.0
2,283.0,23.0,1673.0,0.0,0.0,750.0,0.0,1.0,0.0,0.0,...,M,0.0,0.0,0.0,0.0,1001.4,0,0.0,0.232375,0.099529
3,227.0,0.0,1685.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,V,0.0,0.0,0.0,0.0,1785.59,0,0.0,0.0,0.889793
4,4091.0,1028.0,3425.0,785.0,2.0,14000.0,0.0,0.0,1.0,0.0,...,V,1454.210627,1.0,0.0,0.0,3140.74,0,0.0,0.410495,0.65437


## 0.3 Changing data types

### 0.3.1 changing floats to int

In [12]:
# These columns are not meant to be floats.
# ex.: You cannot have 0.214 of a child.
# The few that are floats are price or time related.
# 

columns = ["damage_inc", "credit_use_ic", "insurance_ic", "spa_ic", "empl_ic", "cab_requests", "bar_no", "sport_ic", "age", "marketing_permit", "dining_ic", "presidential", "client_segment", "sect_empl", "prev_stay", "prev_all_in_stay", "divorce", "fam_adult_size", "children_no", "tenure_mts", "tenure_yrs", "company_ic", "claims_no", "nights_booked", "shop_use", "retired", "gold_status"]


for col in columns:
    if not df_knn_imputed[col].empty:
        df_knn_imputed[col] = df_knn_imputed[col].astype(np.uint8)

print("train\n", df_knn_imputed.info())

# change type to uint8 for the test data set

for col in columns:
    if not df_knn_imputed_test[col].empty:
        df_knn_imputed_test[col] = df_knn_imputed_test[col].astype(np.uint8)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4947 entries, 0 to 4999
Data columns (total 42 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   income_am              4947 non-null   float64
 1   profit_last_am         4947 non-null   float64
 2   profit_am              4947 non-null   float64
 3   damage_am              4947 non-null   float64
 4   damage_inc             4947 non-null   uint8  
 5   crd_lim_rec            4947 non-null   float64
 6   credit_use_ic          4947 non-null   uint8  
 7   insurance_ic           4947 non-null   uint8  
 8   spa_ic                 4947 non-null   uint8  
 9   empl_ic                4947 non-null   uint8  
 10  cab_requests           4947 non-null   uint8  
 11  married_cd             4947 non-null   bool   
 12  bar_no                 4947 non-null   uint8  
 13  sport_ic               4947 non-null   uint8  
 14  neighbor_income        4947 non-null   float64
 15  age 

### 0.3.2 Changing Boolean into Binary

In [13]:
# Converting True to 1 and False to 0

df_knn_imputed["married_cd"] = df_knn_imputed["married_cd"].replace({True: 1, False: 0})
df_knn_imputed_test["married_cd"] = df_knn_imputed_test["married_cd"].replace({True: 1, False: 0})


### 0.3.3 Changing the Gender to binary

In [14]:
# Check if the column gender hasn't already been dropped
if "gender" in df_knn_imputed.columns and "gender" in df_knn_imputed_test.columns:
    
	# If gender contained 'M' -> add 1 to the 'male' column, else 0
	df_knn_imputed['male'] = df_knn_imputed['gender'].apply(lambda x: 1 if x == 'M' else 0)

	# If gender contained 'V' -> add 1 to the 'female' column, else 0
	df_knn_imputed['female'] = df_knn_imputed['gender'].apply(lambda x: 1 if x == 'V' else 0)
		
	# Drop the original 'gender' column
	df_knn_imputed.drop('gender', axis=1, inplace=True)

	# Same for the test data set

	df_knn_imputed_test['male'] = df_knn_imputed_test['gender'].apply(lambda x: 1 if x == 'M' else 0)
	df_knn_imputed_test['female'] = df_knn_imputed_test['gender'].apply(lambda x: 1 if x == 'V' else 0)
	df_knn_imputed_test.drop('gender', axis=1, inplace=True)
 
	# set the type to category
	df_knn_imputed['male'] = df_knn_imputed['male'].astype('category')
	df_knn_imputed['female'] = df_knn_imputed['female'].astype('category')
 
	df_knn_imputed_test['male'] = df_knn_imputed_test['male'].astype('category')
	df_knn_imputed_test['female'] = df_knn_imputed_test['female'].astype('category')
 

In [15]:
df_knn_imputed.head()

Unnamed: 0,income_am,profit_last_am,profit_am,damage_am,damage_inc,crd_lim_rec,credit_use_ic,insurance_ic,spa_ic,empl_ic,...,shop_use,retired,gold_status,outcome_profit,outcome_damage_inc,outcome_damage_amount,score_pos,score_neg,male,female
0,227.0,0.0,3201.0,888.0,6,15000.0,0,0,1,0,...,0,0,0,1791.66,0,0.0,0.652958,0.532814,1,0
1,268.0,16.0,1682.0,0.0,0,750.0,0,1,1,0,...,0,0,0,1672.78,1,829.66,0.0,0.0,1,0
2,283.0,23.0,1673.0,0.0,0,750.0,0,1,0,0,...,0,0,0,1001.4,0,0.0,0.232375,0.099529,1,0
3,227.0,0.0,1685.0,0.0,0,0.0,0,0,0,0,...,0,0,0,1785.59,0,0.0,0.0,0.889793,0,1
4,4091.0,1028.0,3425.0,785.0,2,14000.0,0,0,1,0,...,1,0,0,3140.74,0,0.0,0.410495,0.65437,0,1


### 3.3.4 Splitting the categorical data to two binary columns

There are a few columns that have to be converted to categorial types. Columns such as "retired". This indicates if a person is retired or not. At this point it is only shown by 0's and 1's. But the AI doesn't know if the 0 means retired or not.

With this categorial types are used to split the 0's and 1's in their respective columns. The function below this text will convert the given original column to two new columns with their respective values (given by us). It will after also convert the type into categorial. At the end, it will drop the original column.

In [16]:
def split_column(df, original_column, new_column1, new_column2):
	if original_column not in df.columns:
		return df
	# create two new columns with the same number of rows as the original column
	df[new_column1] = pd.Series(index=df.index)
	df[new_column2] = pd.Series(index=df.index)
	
	# replace the values in the new columns based on the values in the original column    
	df.loc[df[original_column] == 1, new_column1] = 1
	df.loc[df[original_column] == 0, new_column1] = 0
	df.loc[df[original_column] == 1, new_column2] = 0
	df.loc[df[original_column] == 0, new_column2] = 1
	

	# make categorial
	df[new_column1] = df[new_column1].astype('category')
	df[new_column2] = df[new_column2].astype('category')

	# drop the original column
	df.drop(columns=[original_column], inplace=True)
	
	return df

In [17]:
# adding "yes" and "no" with columns that would otherwise have the same names. ex.: divorce cannot be no_divorce and divorce, because divorce is the original column. So this will be no_divorce and yes_divorce. (Otherwise you'll get empty columns)

df_knn_imputed = split_column(df_knn_imputed, "credit_use_ic", "cash/debitcard_use", "creditcard_use")
df_knn_imputed = split_column(df_knn_imputed, "insurance_ic", "not_insured", "insured")
df_knn_imputed = split_column(df_knn_imputed, "spa_ic", "no_spa", "spa")
df_knn_imputed = split_column(df_knn_imputed, "empl_ic", "no_empl", "empl")
df_knn_imputed = split_column(df_knn_imputed, "sport_ic", "no_sport", "sport")
df_knn_imputed = split_column(df_knn_imputed, "marketing_permit", "no_marketing_perm", "marketing_perm")
df_knn_imputed = split_column(df_knn_imputed, "dining_ic", "no_dining", "dining")
df_knn_imputed = split_column(df_knn_imputed, "presidential", "no_presidential", "yes_presidential")
df_knn_imputed = split_column(df_knn_imputed, "prev_stay", "no_prev_stay", "perv_stay")
df_knn_imputed = split_column(df_knn_imputed, "prev_all_in_stay", "no_prev_all_in_stay", "yes_prev_all_in_stay")
df_knn_imputed = split_column(df_knn_imputed, "divorce", "no_divorce", "yes_divorce")
df_knn_imputed = split_column(df_knn_imputed, "company_ic", "no_company_card", "company_card")
df_knn_imputed = split_column(df_knn_imputed, "shop_use", "no_shop_use", "yes_shop_use")
df_knn_imputed = split_column(df_knn_imputed, "retired", "not_retired", "yes_retired")
df_knn_imputed = split_column(df_knn_imputed, "gold_status", "no_gold_status", "yes_gold_status")

print(df_knn_imputed.isnull().sum()[df_knn_imputed.isnull().sum() != 0])
# TEST

df_knn_imputed_test = split_column(df_knn_imputed_test, "credit_use_ic", "cash/debitcard_use", "creditcard_use")
df_knn_imputed_test = split_column(df_knn_imputed_test, "insurance_ic", "not_insured", "insured")
df_knn_imputed_test = split_column(df_knn_imputed_test, "spa_ic", "no_spa", "spa")
df_knn_imputed_test = split_column(df_knn_imputed_test, "empl_ic", "no_empl", "empl")
df_knn_imputed_test = split_column(df_knn_imputed_test, "sport_ic", "no_sport", "sport")
df_knn_imputed_test = split_column(df_knn_imputed_test, "marketing_permit", "no_marketing_perm", "marketing_perm")
df_knn_imputed_test = split_column(df_knn_imputed_test, "dining_ic", "no_dining", "dining")
df_knn_imputed_test = split_column(df_knn_imputed_test, "presidential", "no_presidential", "yes_presidential")
df_knn_imputed_test = split_column(df_knn_imputed_test, "prev_stay", "no_prev_stay", "perv_stay")
df_knn_imputed_test = split_column(df_knn_imputed_test, "prev_all_in_stay", "no_prev_all_in_stay", "yes_prev_all_in_stay")
df_knn_imputed_test = split_column(df_knn_imputed_test, "divorce", "no_divorce", "yes_divorce")
df_knn_imputed_test = split_column(df_knn_imputed_test, "company_ic", "no_company_card", "company_card")
df_knn_imputed_test = split_column(df_knn_imputed_test, "shop_use", "no_shop_use", "yes_shop_use")
df_knn_imputed_test = split_column(df_knn_imputed_test, "retired", "not_retired", "yes_retired")
df_knn_imputed_test = split_column(df_knn_imputed_test, "gold_status", "no_gold_status", "yes_gold_status")

print(df_knn_imputed_test.isnull().sum()[df_knn_imputed_test.isnull().sum() != 0])

print(df_knn_imputed.info())

Series([], dtype: int64)
neighbor_income    17
dtype: int64
<class 'pandas.core.frame.DataFrame'>
Int64Index: 4947 entries, 0 to 4999
Data columns (total 58 columns):
 #   Column                 Non-Null Count  Dtype   
---  ------                 --------------  -----   
 0   income_am              4947 non-null   float64 
 1   profit_last_am         4947 non-null   float64 
 2   profit_am              4947 non-null   float64 
 3   damage_am              4947 non-null   float64 
 4   damage_inc             4947 non-null   uint8   
 5   crd_lim_rec            4947 non-null   float64 
 6   cab_requests           4947 non-null   uint8   
 7   married_cd             4947 non-null   int64   
 8   bar_no                 4947 non-null   uint8   
 9   neighbor_income        4947 non-null   float64 
 10  age                    4947 non-null   uint8   
 11  client_segment         4947 non-null   uint8   
 12  sect_empl              4947 non-null   uint8   
 13  fam_adult_size         4947 non-n

  df[new_column1] = pd.Series(index=df.index)
  df[new_column2] = pd.Series(index=df.index)
  df[new_column1] = pd.Series(index=df.index)
  df[new_column2] = pd.Series(index=df.index)
  df[new_column1] = pd.Series(index=df.index)
  df[new_column2] = pd.Series(index=df.index)
  df[new_column1] = pd.Series(index=df.index)
  df[new_column2] = pd.Series(index=df.index)
  df[new_column1] = pd.Series(index=df.index)
  df[new_column2] = pd.Series(index=df.index)
  df[new_column1] = pd.Series(index=df.index)
  df[new_column2] = pd.Series(index=df.index)
  df[new_column1] = pd.Series(index=df.index)
  df[new_column2] = pd.Series(index=df.index)
  df[new_column1] = pd.Series(index=df.index)
  df[new_column2] = pd.Series(index=df.index)
  df[new_column1] = pd.Series(index=df.index)
  df[new_column2] = pd.Series(index=df.index)
  df[new_column1] = pd.Series(index=df.index)
  df[new_column2] = pd.Series(index=df.index)
  df[new_column1] = pd.Series(index=df.index)
  df[new_column2] = pd.Series(inde

In [18]:
df_knn_imputed.head()

Unnamed: 0,income_am,profit_last_am,profit_am,damage_am,damage_inc,crd_lim_rec,cab_requests,married_cd,bar_no,neighbor_income,...,no_divorce,yes_divorce,no_company_card,company_card,no_shop_use,yes_shop_use,not_retired,yes_retired,no_gold_status,yes_gold_status
0,227.0,0.0,3201.0,888.0,6,15000.0,3,1,2,28936.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0
1,268.0,16.0,1682.0,0.0,0,750.0,7,1,3,16674.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0
2,283.0,23.0,1673.0,0.0,0,750.0,1,1,4,32552.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0
3,227.0,0.0,1685.0,0.0,0,0.0,6,1,8,32252.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0
4,4091.0,1028.0,3425.0,785.0,2,14000.0,4,0,2,29605.0,...,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0


## 0.4 Storing cleaned dataset

In [19]:
# the cleaned csv will now be in the file train_V2_cleaned and will be used for the rest of the calculations

df_knn_imputed.to_csv('train_V2_cleaned.csv', index=False)
df_knn_imputed.to_csv('score_cleaned.csv', index=False)