# Project supervised learning - Drunk smurfs

Jean-Baptiste Maene - Denis Topallaj - Lander Pauwels Malengier 

## 0. Data-cleaning

In [103]:
# import dependencies

%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('seaborn-darkgrid')

  plt.style.use('seaborn-darkgrid')


In [104]:
# read the csv file

df = pd.read_csv('train_V2.csv')
df_test = pd.read_csv('score.csv')

## 0.1 Drop inconsistent or empty data

### 0.1.1 Dropping reoccurring empty rows

In [105]:
# when these three columns are not filled, the rest of the data is not filled.
# the rows without data are dropped.

indexes = df[(df['income_am'].isnull()) & (df['profit_last_am'].isnull()) & (df['profit_am'].isnull())].index

df = df.drop(index=indexes.array)

print('train', df.shape)

# same with the test file

indexes = df_test[(df_test['income_am'].isnull()) & (df_test['profit_last_am'].isnull()) & (df_test['profit_am'].isnull())].index

df_test = df_test.drop(index=indexes.array)

print('test', df_test.shape)


train (4947, 53)
test (496, 50)


### 0.1.2 Dropping duplicates

In [106]:
df.drop_duplicates()

df_test.drop_duplicates()

Unnamed: 0,income_am,profit_last_am,profit_am,damage_am,damage_inc,crd_lim_rec,credit_use_ic,gluten_ic,lactose_ic,insurance_ic,...,score1_pos,score1_neg,score2_pos,score2_neg,score3_pos,score3_neg,score4_pos,score4_neg,score5_pos,score5_neg
0,5660.0,4320.0,8640.0,0.0,0.0,8000.0,0.0,0.0,1.0,0.0,...,0.538419,0.396819,0.423742,0.763608,,,,,,
1,3990.0,9.0,3450.0,0.0,0.0,12500.0,0.0,0.0,0.0,1.0,...,,,,,,,,,,
2,1158.0,82.0,4194.0,408.0,4.0,12000.0,0.0,0.0,0.0,1.0,...,0.009811,0.592842,,,0.252444,0.724693,0.818064,0.387361,,
3,2451.0,791.0,2119.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,,,,,,,,,,
4,946.0,222.0,2036.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,820.0,216.0,7794.0,1103.0,3.0,9000.0,0.0,0.0,0.0,1.0,...,0.307239,0.660891,,,0.738333,0.914151,,,0.262224,8.060677
496,6092.0,2100.0,3137.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,0.419981,0.668320,,,,,,
497,2301.0,214.0,2516.0,0.0,0.0,11000.0,0.0,0.0,0.0,0.0,...,0.837325,0.663044,0.697171,0.353229,,,,,,
498,492.0,0.0,3716.0,713.0,2.0,5000.0,0.0,0.0,0.0,1.0,...,,,,,,,,,0.461598,4.757132


### 0.1.3 Dropping outliers

In [107]:
# score5_neg has scores ranging from 7995 trillion to -472 trillion and everything in between. score5_neg and score5_pos will be dropped.

# looking at the score of the other four scoring systems we can confidently say that score5_neg can be dropped (score5_pos also has to be dropped, because they both make a pair)

# quantile score can only range 0 < q < 1

columns = ["score5_neg", "score5_pos"]

for col in columns:
    if col in df.columns:
        df.drop(col, axis=1, inplace=True)

# dropping the score5 for the test data set

for col in columns:
    if col in df_test.columns:
        df_test.drop(col, axis=1, inplace=True)

print("train", df.shape)
print("test", df_test.shape)

train (4947, 51)
test (496, 48)


### 0.1.4 Dropping unethical/'useless' columns

In [108]:
# Having gender, place or origin, race ... as a factor to disallow smurfs from entering a hotel is unethical.

if "urban_ic" in df.columns and "neighbor_income" in df.columns and "gender" in df.columns:
	df = df.drop('urban_ic', axis=1)
	df = df.drop('neighbor_income', axis=1)
	df = df.drop('gender', axis=1)
 
if "urban_ic" in df_test.columns and "neighbor_income" in df_test.columns and "gender" in df_test.columns:
	df_test = df_test.drop('urban_ic', axis=1)
	df_test = df_test.drop('neighbor_income', axis=1)
	df_test = df_test.drop('gender', axis=1)

print("train", df.shape)
print("test", df_test.shape)

train (4947, 48)
test (496, 45)


## 0.2 Handle missing data

In [109]:
# These are the columns with missing data

print("train\n", df.isnull().sum()[df.isnull().sum() != 0])

print("\ntest\n", df_test.isnull().sum()[df_test.isnull().sum() != 0])

train
 cab_requests      35
dining_ic         35
presidential      35
tenure_mts       339
tenure_yrs       339
shop_use          35
score1_pos      3722
score1_neg      3633
score2_pos      3738
score2_neg      3643
score3_pos      3686
score3_neg      3580
score4_pos      3724
score4_neg      3623
dtype: int64

test
 cab_requests      1
dining_ic         1
presidential      1
tenure_mts       35
tenure_yrs       35
shop_use          1
score1_pos      374
score1_neg      362
score2_pos      385
score2_neg      372
score3_pos      360
score3_neg      351
score4_pos      377
score4_neg      370
dtype: int64


### 0.2.1 Using mean for missing data

In [110]:
from sklearn.impute import SimpleImputer

df_mean_imputed = df.copy()

# mean worthy columns
columns = ["cab_requests", "dining_ic", "presidential", "shop_use"]

for col in columns:
	mean_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

	mean_imputer = mean_imputer.fit(np.array(df[col]).reshape(-1, 1))
 
	df_mean_imputed[col] = mean_imputer.transform(np.array(df[col]).reshape(-1, 1))

print("train\n", df_mean_imputed.isnull().sum()[df_mean_imputed.isnull().sum() != 0])


# using mean_imputer for the test data
df_mean_imputed_test = df_test.copy()

# mean worthy columns
columns = ["cab_requests", "dining_ic", "presidential", "shop_use"]

for col in columns:
	mean_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

	mean_imputer = mean_imputer.fit(np.array(df_test[col]).reshape(-1, 1))
 
	df_mean_imputed_test[col] = mean_imputer.transform(np.array(df_test[col]).reshape(-1, 1))

print("\ntest\n", df_mean_imputed_test.isnull().sum()[df_mean_imputed_test.isnull().sum() != 0])



train
 tenure_mts     339
tenure_yrs     339
score1_pos    3722
score1_neg    3633
score2_pos    3738
score2_neg    3643
score3_pos    3686
score3_neg    3580
score4_pos    3724
score4_neg    3623
dtype: int64

test
 tenure_mts     35
tenure_yrs     35
score1_pos    374
score1_neg    362
score2_pos    385
score2_neg    372
score3_pos    360
score3_neg    351
score4_pos    377
score4_neg    370
dtype: int64


### 0.2.2 Using KNN for missing data

In [111]:
# K-Nearest Neighbors (KNN) Imputation will be used for "tenure_mts" and "tenure_yrs" because it is a good option when the missingness is random, and there is no clear pattern to the missing data.

from sklearn.impute import KNNImputer

df_knn_imputed = df_mean_imputed.copy()

columns = ["tenure_mts", "tenure_yrs"]

for col in columns:
	knn_imputer = KNNImputer(n_neighbors=5)
	imputed_col = knn_imputer.fit_transform(df_knn_imputed[[col]])
	df_knn_imputed[col] = imputed_col

print("train\n", df_knn_imputed.isnull().sum()[df_knn_imputed.isnull().sum() != 0])


# using the KNNImputer for the test data set

df_knn_imputed_test = df_mean_imputed_test.copy()

columns = ["tenure_mts", "tenure_yrs"]

for col in columns:
	knn_imputer = KNNImputer(n_neighbors=5)
	imputed_col = knn_imputer.fit_transform(df_knn_imputed_test[[col]])
	df_knn_imputed_test[col] = imputed_col

print("\ntest\n", df_knn_imputed_test.isnull().sum()[df_knn_imputed_test.isnull().sum() != 0])

train
 score1_pos    3722
score1_neg    3633
score2_pos    3738
score2_neg    3643
score3_pos    3686
score3_neg    3580
score4_pos    3724
score4_neg    3623
dtype: int64

test
 score1_pos    374
score1_neg    362
score2_pos    385
score2_neg    372
score3_pos    360
score3_neg    351
score4_pos    377
score4_neg    370
dtype: int64


### 0.2.3 Handle missing data for scores

In [113]:
# We will be looping over each row and averaging the scores inside the pos and neg column.

# When there is no scores given by the staff, the score will default to 0.

# In the end the columns will be dropped and replaced by two new columns: "score_pos" and "score_neg".

# There are also checks added for when columns that need to be removed are already removed.

import numpy as np

def calculate_scores(df, columns):
	for i, row in df.iterrows():
		pos_sum = 0
		pos_count = 0
		neg_sum = 0
		neg_count = 0
		for col in columns:
			if "pos" in col:
				if not np.isnan(row[col]):
					pos_sum += row[col]
					pos_count += 1
			elif "neg" in col:
				if not np.isnan(row[col]):
					neg_sum += row[col]
					neg_count += 1
		if pos_count > 0:
			df.at[i, 'score_pos'] = pos_sum / pos_count
		if neg_count > 0:
			df.at[i, 'score_neg'] = neg_sum / neg_count

	df['score_pos'].fillna(0, inplace=True)
	df['score_neg'].fillna(0, inplace=True)
 
	df.drop(columns=columns, inplace=True)
	return df



In [115]:
# executing the function using the most recent dataFrame and the columns

columns=["score1_neg", "score1_pos", "score2_pos", "score2_neg", "score3_pos", "score3_neg", "score4_pos", "score4_neg"]

df_knn_imputed = calculate_scores(df_knn_imputed, columns=columns)


print("\ntrain\n", df_knn_imputed.isnull().sum()[df_knn_imputed.isnull().sum() != 0])

KeyError: 'score1_neg'

In [116]:
print("\ntrain\n", df_knn_imputed.isnull().sum()[df_knn_imputed.isnull().sum() != 0])


train
 Series([], dtype: int64)


## 0.3 Changing data types

### 0.3.1 changing floats to int

In [52]:
# These columns are not meant to be floats.
# ex.: You cannot have 0.214 of a child.
# The few that are floats are price or time related.

columns = ["damage_inc", "credit_use_ic", "gluten_ic", "lactose_ic", "insurance_ic", "spa_ic", "empl_ic", "cab_requests", "bar_no", "sport_ic", "age", "marketing_permit", "dining_ic", "presidential", "client_segment", "sect_empl", "prev_stay", "prev_all_in_stay", "divorce", "fam_adult_size", "children_no", "tenure_mts", "tenure_yrs", "company_ic", "claims_no", "nights_booked", "shop_use", "retired", "gold_status"]


for col in columns:
    if not df_knn_imputed[col].empty:
        df_knn_imputed[col] = df_knn_imputed[col].astype(np.uint8)
        
print("train\n", df_knn_imputed.info())

# change type to uint8 for the test data set

for col in columns:
    if not df_knn_imputed_test[col].empty:
        df_knn_imputed_test[col] = df_knn_imputed_test[col].astype(np.uint8)

print("\ntest\n", df_knn_imputed_test.info())

IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer

In [None]:
# Converting True to 1 and False to 0

def replace_true_false(dataframe, column_name):
    dataframe[column_name] = dataframe[column_name].replace({True: 1, False: 0})
    return dataframe

In [None]:
# Executing the function and replacing the new dataFrame

df_knn_imputed = replace_true_false(df_knn_imputed, 'married_cd')

## 0.4 Storing cleaned dataset

In [107]:
# the cleaned csv will now be in the file train_V2_cleaned and will be used for the rest of the calculations

df_knn_imputed.to_csv('train_V2_cleaned.csv', index=False)
df_knn_imputed.to_csv('score_cleaned.csv', index=False)