# Project supervised learning - Drunk smurfs

Jean-Baptiste Maene - Denis Topallaj - Lander Pauwels Malengier 

## 0. Data-cleaning

In [114]:
# import dependencies

%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
# Matplotlib 3.6 deprecated its bundled 'seaborn-*' styles and kept them under the
# 'seaborn-v0_8-*' names; later releases removed the old names entirely.
# Use the renamed style when this Matplotlib provides it, otherwise the legacy name.
plt.style.use('seaborn-v0_8-darkgrid' if 'seaborn-v0_8-darkgrid' in plt.style.available else 'seaborn-darkgrid')

  plt.style.use('seaborn-darkgrid')


In [115]:
# Load the training data from 'train_V2.csv' into the working DataFrame `df`.
# The path is relative, so the file must be in the notebook's working directory.

df = pd.read_csv('train_V2.csv')

## 0.1 Drop inconsistent or empty data

### 0.1.1 Dropping reoccurring empty rows

In [116]:
# Rows in which income_am, profit_last_am and profit_am are all missing are discarded.
# (Per the original author's observation, such rows have no other data filled in either.)

key_columns = ['income_am', 'profit_last_am', 'profit_am']
all_keys_missing = df[key_columns].isnull().all(axis=1)

# index labels of the rows to remove
indexes = df.index[all_keys_missing]

df = df.drop(index=indexes)

df.shape

(4947, 53)

### 0.1.2 Dropping duplicates

In [117]:
# DataFrame.drop_duplicates() returns a new frame and leaves `df` untouched,
# so the result must be assigned back for the duplicate rows to actually be removed.
df = df.drop_duplicates()

df

Unnamed: 0,income_am,profit_last_am,profit_am,damage_am,damage_inc,crd_lim_rec,credit_use_ic,gluten_ic,lactose_ic,insurance_ic,...,score2_neg,score3_pos,score3_neg,score4_pos,score4_neg,score5_pos,score5_neg,outcome_profit,outcome_damage_inc,outcome_damage_amount
0,227.0,0.0,3201.0,888.0,6.0,15000.0,0.0,0.0,0.0,0.0,...,,,,0.838147,0.082288,,,1791.66,0,0.00
1,268.0,16.0,1682.0,0.0,0.0,750.0,0.0,0.0,0.0,1.0,...,,,,,,,7.955259,1672.78,1,829.66
2,283.0,23.0,1673.0,0.0,0.0,750.0,0.0,0.0,0.0,1.0,...,0.099529,,,,,0.101955,1.743020,1001.40,0,0.00
3,227.0,0.0,1685.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,0.889793,,,,,1785.59,0,0.00
4,4091.0,1028.0,3425.0,785.0,2.0,14000.0,0.0,0.0,1.0,0.0,...,,0.330503,0.766294,0.490486,0.542445,,,3140.74,0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,584.0,52.0,1769.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,0.745643,0.295942,0.830932,0.071366,0.313204,3.739346,2172.82,0,0.00
4996,227.0,0.0,1620.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,1057.83,0,0.00
4997,239.0,5.0,2068.0,0.0,0.0,2500.0,0.0,0.0,0.0,1.0,...,0.049782,,,0.544873,0.331139,,,188.77,0,0.00
4998,1068.0,104.0,6405.0,490.0,2.0,15000.0,0.0,0.0,0.0,1.0,...,,,,0.580246,0.388815,,,1863.41,0,0.00


### 0.1.3 Dropping outliers

In [118]:
# score5_neg has scores ranging from 7995 trillion down to -472 trillion and everything in between,
# while a quantile score can only range 0 < q < 1.
# Comparing with the other four scoring systems, score5_neg can confidently be dropped;
# score5_pos has to be dropped too, because the two columns form a pair.

# A single drop with errors='ignore' removes whichever of the two columns is still present,
# so re-running the cell is harmless and a half-dropped pair cannot be left behind.
df = df.drop(columns=['score5_pos', 'score5_neg'], errors='ignore')

df.shape

(4947, 51)

### 0.1.4 Dropping unethical/'useless' columns

In [119]:
# Using gender, place of origin, race ... as a factor to disallow smurfs from entering a hotel
# is unethical, so these columns are removed from the data.

# errors='ignore' drops every listed column that is still present; the cell can be re-run
# safely and still cleans up if only some of the columns remain.
df = df.drop(columns=['urban_ic', 'neighbor_income', 'gender'], errors='ignore')

df.shape

(4947, 48)

## 0.2 Handle missing data

In [120]:
# Number of missing values per column, listing only the columns that have any.

null_counts = df.isnull().sum()
null_counts[null_counts != 0]

cab_requests      35
dining_ic         35
presidential      35
tenure_mts       339
tenure_yrs       339
shop_use          35
score1_pos      3722
score1_neg      3633
score2_pos      3738
score2_neg      3643
score3_pos      3686
score3_neg      3580
score4_pos      3724
score4_neg      3623
dtype: int64

### 0.2.1 Using mean for missing data

In [121]:
from sklearn.impute import SimpleImputer

# Work on a copy so the un-imputed `df` stays available.
df_mean_imputed = df.copy()

# columns whose missing values are replaced by the column mean
columns = ["cab_requests", "dining_ic", "presidential", "shop_use"]

for col in columns:
	# SimpleImputer expects a 2-D input, hence the single-column reshape
	column_values = df[col].to_numpy().reshape(-1, 1)

	mean_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
	df_mean_imputed[col] = mean_imputer.fit_transform(column_values)

# columns that still contain missing values after mean imputation
remaining_nulls = df_mean_imputed.isnull().sum()
remaining_nulls[remaining_nulls != 0]

tenure_mts     339
tenure_yrs     339
score1_pos    3722
score1_neg    3633
score2_pos    3738
score2_neg    3643
score3_pos    3686
score3_neg    3580
score4_pos    3724
score4_neg    3623
dtype: int64

### 0.2.2 Using KNN for missing data

In [122]:
# K-Nearest Neighbors (KNN) imputation is used for "tenure_mts" and "tenure_yrs".
#
# KNN needs *other* features to measure how similar two rows are. Fitting the imputer on a
# single column gives every row with a missing value a NaN distance to all donors, and
# scikit-learn then falls back to the plain column mean. The imputer is therefore fitted once
# on the tenure columns together with a set of fully observed predictor columns.

from sklearn.impute import KNNImputer

df_knn_imputed = df_mean_imputed.copy()

columns = ["tenure_mts", "tenure_yrs"]

# Neighbour features: numeric, without missing values, not constant, and not one of the
# outcome_* targets (using targets here would leak label information into the features).
# Columns named "Unnamed: N" are what pandas produces for a header-less CSV column; if one is
# present it is presumably a row id rather than a feature, so it is skipped as well.
numeric_cols = df_knn_imputed.select_dtypes(include="number").columns
predictors = [
    col for col in numeric_cols
    if col not in columns
    and not col.startswith(("outcome_", "Unnamed"))
    and df_knn_imputed[col].notna().all()
    and df_knn_imputed[col].nunique() > 1
]

# Standardise the predictors so that large-valued columns do not dominate the distance.
scaled_predictors = df_knn_imputed[predictors]
scaled_predictors = (scaled_predictors - scaled_predictors.mean()) / scaled_predictors.std()

# The tenure columns go last so their imputed values can be sliced off the end of the result.
knn_input = pd.concat([scaled_predictors, df_knn_imputed[columns]], axis=1)

knn_imputer = KNNImputer(n_neighbors=5)
imputed_values = knn_imputer.fit_transform(knn_input)

for offset, col in enumerate(columns):
    df_knn_imputed[col] = imputed_values[:, len(predictors) + offset]

# columns that still contain missing values after KNN imputation
remaining_nulls = df_knn_imputed.isnull().sum()
remaining_nulls[remaining_nulls != 0]

score1_pos    3722
score1_neg    3633
score2_pos    3738
score2_neg    3643
score3_pos    3686
score3_neg    3580
score4_pos    3724
score4_neg    3623
dtype: int64

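### 0.2.3 Handling the missing score data (TODO)

**TODO:** find a way to fill in the score columns (`score1_pos` … `score4_neg`); each of them is still missing for roughly three quarters of the rows.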

## 0.3 Changing data types

### 0.3.1 changing floats to int

In [130]:
# These columns are not meant to be floats.
# ex.: You cannot have 0.214 of a child.
# The few columns that stay floats are price related.

columns = ["damage_inc", "credit_use_ic", "gluten_ic", "lactose_ic", "insurance_ic", "spa_ic", "empl_ic", "cab_requests", "married_cd", "bar_no", "sport_ic", "age", "marketing_permit", "dining_ic", "presidential", "client_segment", "sect_empl", "prev_stay", "prev_all_in_stay", "divorce", "fam_adult_size", "children_no", "tenure_mts", "tenure_yrs", "company_ic", "claims_no", "nights_booked", "shop_use", "retired", "gold_status"]


for col in columns:
    # Imputed values can be fractional; astype(int) alone would truncate them toward zero,
    # so round to the nearest whole number before casting.
    df_knn_imputed[col] = df_knn_imputed[col].round().astype(np.int64)

df_knn_imputed.info()

0       1
1       1
2       1
3       1
4       0
       ..
4995    1
4996    1
4997    0
4998    1
4999    0
Name: married_cd, Length: 4947, dtype: int64