# Advanced Machine Learning
## Project 1 - Optimization Algorithms
### Authors: Bartosz Grabek, Izabela Telejko, Grzegorz Zbrzeżny

In [1]:
import numpy as np
import pandas as pd
from scipy.io import arff
from sklearn import preprocessing 
from statsmodels.stats.outliers_influence import variance_inflation_factor  
pd.set_option('display.max_columns', 500)

# Preprocessing

Function for calculating VIF (source: https://stats.stackexchange.com/questions/155028/how-to-systematically-remove-collinear-variables-pandas-columns-in-python)

In [2]:
def one_hot_encode(df):
    for column in df:
        if is_object_dtype(df[column]):
            dummies = pd.get_dummies(df[column], prefix=column)
            if np.sum(df[column].isna()) == 0:
                dummies = dummies.iloc[:, :-1]
            df = df = df.drop(column, axis = 1)
            df = df.join(dummies)
    return df

In [3]:
def calculate_vif(X, thresh=5.0):
    X = X.assign(const=1)  # faster than add_constant from statsmodels
    variables = list(range(X.shape[1]))
    dropped = True
    while dropped:
        dropped = False
        vif = [variance_inflation_factor(X.iloc[:, variables].values, ix) for ix in range(X.iloc[:, variables].shape[1])]
        vif = vif[:-1]  # don't let the constant be removed in the loop.
        maxloc = vif.index(max(vif))
        if max(vif) > thresh:
            print('dropping \'' + X.iloc[:, variables].columns[maxloc] + '\' at index: ' + str(maxloc))
            del variables[maxloc]
            dropped = True

    print('Remaining variables:')
    print(X.columns[variables[:-1]])
    return X.iloc[:, variables[:-1]]

## Preprocessing functions

In [4]:
def preprocess_employee(filename):
    df = pd.read_csv(filename)
    df['EducationBachelors'] = 1 * (df['Education'] == 'Bachelors')
    df['EducationMasters'] = 1 * (df['Education'] == 'Masters')
    df['Gender'] = df['Gender'].map({'Female': 1, 'Male': 0})
    df['EverBenched'] = df['EverBenched'].map({'No': 0, 'Yes': 1})
    df.drop(['Education', 'City'], axis=1, inplace=True)
    X = calculate_vif(df.drop('LeaveOrNot', axis=1)).to_numpy()
    y = df['LeaveOrNot'].to_numpy()
    return X, y

def preprocess_challenger(filename):
    df = pd.read_csv(filename)
    df.drop('gameId', axis=1, inplace=True)
    for col in ['blue', 'red']:
        for lane in ['BOT_LANE', 'MID_LANE', 'TOP_LANE']:
            df[f'{col}FirstTowerLane_{lane}'] = df[f'{col}FirstTowerLane'].apply(lambda x: int(lane in x))
        for dragon in ['AIR_DRAGON', 'WATER_DRAGON', 'FIRE_DRAGON', 'EARTH_DRAGON']:
            df[f'{col}DragnoType_{dragon}'] = df[f'{col}DragnoType'].apply(lambda x: int(lane in x))
        df.drop(f'{col}FirstTowerLane', axis=1, inplace=True)
        df.drop(f'{col}DragnoType', axis=1, inplace=True)
    X = calculate_vif(df.drop('blueWins', axis=1)).to_numpy()
    y = df['blueWins'].to_numpy()
    return X, y

def preprocess_jungle(filename):
    df = arff.loadarff(filename)
    df = pd.DataFrame(df[0])
    str_df = df.select_dtypes([object])
    str_df = str_df.stack().str.decode('utf-8').unstack()
    for col in str_df:
        df[col] = str_df[col]
    df = df[df['class'] != 'd']
    df[['highest_strength', 'closest_to_den', 'fastest_to_den', 'class']] = df.copy()[['highest_strength', 'closest_to_den', 'fastest_to_den', 'class']].applymap(lambda x: int(x == 'w'))
    df = pd.concat([df, pd.get_dummies(df[['white_piece0_advanced', 'black_piece0_advanced']], drop_first=True)], axis=1)
    df.drop(['white_piece0_advanced', 'black_piece0_advanced'], axis=1, inplace=True)
    df = df.apply(pd.to_numeric)
    X = calculate_vif(df.drop('class', axis=1)).to_numpy()
    y = df['class'].to_numpy()
    return X, y

def preprocess_water(filename):
    water = pd.read_csv(filename)
    water["ammonia"] = water["ammonia"].replace("#NUM!", -100)
    water["ammonia"] = water["ammonia"].astype(float)
    water["ammonia"] = water["ammonia"].replace(-100, water.loc[water["ammonia"] != -100, "ammonia"].mean())
    y_water = water.is_safe.to_numpy()
    X_water = water.drop("is_safe", axis=1)
    X_water = calculate_vif_(X_water).to_numpy()
    return X_water, y_water

def preprocess_booking(filename):
    booking = pd.read_csv(filename).drop(["Booking_ID", "date of reservation"], axis=1)
    booking["market segment type"] = 1*(booking["market segment type"] == "Online")
    booking["booking status"] = 1*(booking["booking status"] == "Canceled")
    label_encoder = preprocessing.LabelEncoder() 
    booking["room type"] = label_encoder.fit_transform(booking["room type"]) 
    booking = one_hot_encode(booking)
    y_booking = booking["booking status"].to_numpy()
    X_booking = booking.drop("booking status", axis=1)
    X_booking = calculate_vif_(X_booking).to_numpy()
    return X_booking, y_booking

def preprocess_churn(filename):
    churn = pd.read_csv(filename)
    churn["FrequentFlyer"] = 1*(churn["FrequentFlyer"] == "Yes")
    churn["BookedHotelOrNot"] = 1*(churn["BookedHotelOrNot"] == "Yes")
    churn["AccountSyncedToSocialMedia"] = 1*(churn["AccountSyncedToSocialMedia"] == "Yes")
    churn.loc[churn["AnnualIncomeClass"] == "Low Income", "AnnualIncomeClass"] = 0
    churn.loc[churn["AnnualIncomeClass"] == "Middle Income", "AnnualIncomeClass"] = 1
    churn.loc[churn["AnnualIncomeClass"] == "High Income", "AnnualIncomeClass"] = 2
    churn.AnnualIncomeClass = churn.AnnualIncomeClass.astype(int)
    y_churn = churn.Target.to_numpy()
    X_churn = churn.drop("Target", axis=1)
    X_churn = calculate_vif_(X_churn).to_numpy()
    return X_churn, y_churn

In [18]:
X_employee, y_employee = preprocess_employee('data/Employee.csv')
X_challenger, y_challenger = preprocess_challenger('data/Challenger_LOL.csv')
X_jungle, y_jungle = preprocess_jungle('data/jungle_chess.arff')
X_water, y_water = preprocess_water("water_quality.csv")
X_booking, y_booking = preprocess_booking("booking.csv")
X_churn, y_churn = preprocess_churn("churn.csv")

Remaining variables:
Index(['JoiningYear', 'PaymentTier', 'Age', 'Gender', 'EverBenched',
       'ExperienceInCurrentDomain', 'EducationBachelors', 'EducationMasters'],
      dtype='object')


  vif = 1. / (1. - r_squared_i)
  return 1 - self.ssr/self.centered_tss


dropping 'blueTotalLevel' at index: 2
dropping 'blueKill' at index: 6
dropping 'blueDeath' at index: 6
dropping 'blueFirstTower' at index: 9
dropping 'blueTowerKills' at index: 10
dropping 'redTotalLevel' at index: 20
dropping 'redFirstTower' at index: 29
dropping 'redTowerKills' at index: 30
dropping 'blueTotalGolds' at index: 0
dropping 'redAvgLevel' at index: 19
dropping 'redTotalGolds' at index: 17
dropping 'redFirstDragon' at index: 31
dropping 'blueAssist' at index: 5
dropping 'redAssist' at index: 22
dropping 'blueFirstInhibitor' at index: 7
dropping 'redFirstInhibitor' at index: 23
dropping 'blueTotalMinionKills' at index: 2
dropping 'redDeath' at index: 19
Remaining variables:
Index(['blueCurrentGolds', 'blueAvgLevel', 'blueTotalJungleMinionKills',
       'blueFirstBlood', 'blueWardPlaced', 'blueWardKills',
       'blueMidTowerKills', 'blueTopTowerKills', 'blueBotTowerKills',
       'blueInhibitor', 'blueFirstDragon', 'blueDragon', 'blueRiftHeralds',
       'redWins', 'redCurr

  vif = 1. / (1. - r_squared_i)
  return 1 - self.ssr/self.centered_tss
  return 1 - self.ssr/self.centered_tss


dropping 'white_piece0_rank' at index: 2
dropping 'white_piece0_strength' at index: 0
dropping 'black_piece0_strength' at index: 15
dropping 'black_piece0_rank' at index: 16
dropping 'black_piece0_can_cross_shortest' at index: 25
dropping 'black_piece0_distanceto_white_den' at index: 16
dropping 'white_piece0_distanceto_black_den' at index: 2
dropping 'white_unopposed_to_den' at index: 33
dropping 'white_piece0_distanceto_white_den' at index: 1
dropping 'black_piece0_distanceto_black_den' at index: 14
dropping 'black_unopposed_to_den' at index: 31
dropping 'white_piece0_can_cross_shortest' at index: 8
dropping 'black_piece0_movesto_white_den' at index: 13
dropping 'white_piece0_nextto_black_piece0' at index: 11
dropping 'white_piece0_movesto_black_den' at index: 4
dropping 'white_piece0_advanced_DEF' at index: 29
dropping 'black_piece0_advanced_DEF' at index: 30
Remaining variables:
Index(['white_piece0_file', 'white_piece0_unopposedto_black_den_length',
       'white_piece0_unopposedt