In [None]:
import os

def scale_input_data(scale_factor, file_bases=None):
  """Write a row-scaled copy of each input CSV next to the original.

  For every base path ``b`` the file ``b + '.csv'`` is read and
  ``b + '.scaled.csv'`` is written with ``int(scale_factor * n_rows)`` rows:
  a factor of exactly 1.0 copies the file unchanged, a factor below 1.0 keeps
  the leading rows, and a factor above 1.0 repeats the rows cyclically until
  the target row count is reached.

  Args:
    scale_factor: Non-negative multiplier applied to the row count.
    file_bases: Iterable of file paths without the '.csv' extension.
      Defaults to the train and test inputs under ./input.

  Raises:
    ValueError: If scale_factor is negative (a negative slice bound would
      otherwise silently trim rows from the end instead of scaling).
  """
  # Kept function-local (but hoisted out of the loop) so that this helper does
  # not bind `pd` in the notebook namespace; the notebook's own imports are
  # set up in a later cell.
  import shutil
  import pandas as pd

  if scale_factor < 0:
    raise ValueError('scale_factor must be non-negative, got %r' % (scale_factor,))
  if file_bases is None:
    file_bases = ['./input/train', './input/test']

  for file_base in file_bases:
    source_path = file_base + '.csv'
    scaled_path = file_base + '.scaled.csv'

    if scale_factor == 1.0:
      # Nothing to rescale: a byte-for-byte copy avoids a parse/serialize round trip.
      shutil.copyfile(source_path, scaled_path)
      continue

    df_to_scale = pd.read_csv(source_path)
    new_num_rows = int(scale_factor * len(df_to_scale))
    if scale_factor <= 1.0:
      df_to_scale = df_to_scale.iloc[:new_num_rows]
    else:
      # Each pass appends at most one full copy of the current frame, so the
      # frame doubles until the remaining gap is smaller than the frame itself.
      # .iloc clamps an oversized bound, and keeps the slice positional even
      # though the concatenated index contains duplicate labels.
      while len(df_to_scale) < new_num_rows:
        missing = new_num_rows - len(df_to_scale)
        df_to_scale = pd.concat([df_to_scale, df_to_scale.iloc[:missing]])
    df_to_scale.to_csv(scaled_path, index=False)

# Rescale the input CSVs only when the caller asked for it through the
# INPUT_SCALE_FACTOR environment variable; the value is parsed as a float.
if os.environ.get('INPUT_SCALE_FACTOR') is not None:
  scale_input_data(float(os.environ['INPUT_SCALE_FACTOR']))

After some playing around with the Titanic dataset and getting introduced to machine learning and Kaggle, this is my first notebook on Kaggle. In this notebook I attempt to tackle the monsters classification problem through a simple logistic regression model.

## Importing libraries ##

In [1]:
# import pandas as pd # data processing, CSV file I/O
# The import statements are not written out here: the contents of the
# IREWR_IMPORTS environment variable are executed as Python source.
# Later cells use `pd`, so that string is presumably expected to bind pandas
# (or a compatible library) to `pd` -- TODO confirm with whoever sets it.
# NOTE(review): exec() runs whatever the variable contains, and the lookup
# raises KeyError when the variable is unset; only run this notebook where the
# variable is set by a trusted source.
exec(os.environ['IREWR_IMPORTS'])
# ALEX: remove plotting
# import seaborn as sns # plotting
# from sklearn.linear_model import LogisticRegression # Logistic regression

## Load datasets from CSV files and check for missing values ##

In [2]:
# Read the scaled train and test CSVs (in that order) from ./input.
train, test = (
    pd.read_csv("./input/" + split + ".scaled.csv")
    for split in ("train", "test")
)

# Per-column flag: True if the training data has any missing value in that column.
train.isna().any()

id               False
bone_length      False
rotting_flesh    False
hair_length      False
has_soul         False
color            False
type             False
dtype: bool

## Explore dataset ##

In [3]:
# Show the first rows to check that the columns were parsed as expected.
train.head()

Unnamed: 0,id,bone_length,rotting_flesh,hair_length,has_soul,color,type
0,0,0.354512,0.350839,0.465761,0.781142,clear,Ghoul
1,1,0.57556,0.425868,0.531401,0.439899,green,Goblin
2,2,0.467875,0.35433,0.811616,0.791225,black,Ghoul
3,4,0.776652,0.508723,0.636766,0.884464,black,Ghoul
4,5,0.566117,0.875862,0.418594,0.636438,green,Ghost


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
train.describe()

Unnamed: 0,id,bone_length,rotting_flesh,hair_length,has_soul
count,371.0,371.0,371.0,371.0,371.0
mean,443.67655,0.43416,0.506848,0.529114,0.471392
std,263.222489,0.132833,0.146358,0.169902,0.176129
min,0.0,0.061032,0.095687,0.1346,0.009402
25%,205.5,0.340006,0.414812,0.407428,0.348002
50%,458.0,0.434891,0.501552,0.538642,0.466372
75%,678.5,0.517223,0.603977,0.647244,0.60061
max,897.0,0.817001,0.932466,1.0,0.935721


## Explore categorical variables with Seaborn ##

In [5]:
# ALEX: remove plotting
# ax = sns.countplot(x='type', data=train, palette='Set3')

In [6]:
# ALEX: remove plotting
# ax = sns.countplot(x='color', data=train, palette='Set3')

## Drop 'id' column and make categorical variables numerical ##
This way we can make a sensible pair plot and correlation matrix to see potential relationships between variables.

In [7]:
# Keep the ids around in case they are needed later, then remove the column
# from the training frame.
id_list = list(train['id'])
# ALEX: make notebook run -- the original call was `train.drop('id', 1)`;
# the axis is now selected by keyword instead of by position.
train = train.drop(columns='id')

# Integer codes for the two categorical variables, numbered in listing order.
type_dict = {name: code for code, name in enumerate(['Ghoul', 'Goblin', 'Ghost'])}
color_dict = {
    name: code
    for code, name in enumerate(['clear', 'green', 'black', 'white', 'blue', 'blood'])
}

# Replace the string labels with their codes, stored as floats.
# The test set has no 'type' column to convert, only 'color'.
train = train.assign(
    type=train['type'].map(type_dict).astype(float),
    color=train['color'].map(color_dict).astype(float),
)
test = test.assign(color=test['color'].map(color_dict).astype(float))

In [8]:
# ALEX: remove plotting
# sns.pairplot(train, hue='type', palette='Set3') 

#Legend label text shows (0, 1, 2); any tips on how to change legend text are more than welcome!

In [9]:
# Pairwise Pearson correlation between all columns of the training frame.
# 'type' and 'color' take part through the integer codes assigned to them
# earlier, so their coefficients depend on that (arbitrary) numbering.
train.corr(method='pearson')

Unnamed: 0,bone_length,rotting_flesh,hair_length,has_soul,color,type
bone_length,1.0,-0.041716,0.353881,0.381675,0.018126,-0.520687
rotting_flesh,-0.041716,1.0,-0.220353,-0.132051,0.118533,0.278228
hair_length,0.353881,-0.220353,1.0,0.474835,-0.123635,-0.67464
has_soul,0.381675,-0.132051,0.474835,1.0,-0.007005,-0.649989
color,0.018126,0.118533,-0.123635,-0.007005,1.0,0.034793
type,-0.520687,0.278228,-0.67464,-0.649989,0.034793,1.0


'Bone length', 'hair length' and 'soul' all have a strong negative relationship with 'type of monster'. 'Hair length' and 'soul' also turn out to have a fairly strong relationship with each other. 'Color' has a very weak relationship with 'type of monster'. Let's make 'color' binary (did this mainly for practice; not sure about the added value).

In [10]:
def _one_hot_color(frame, color_codes):
    """Return `frame` with its numeric 'color' column replaced by indicator columns.

    One 'color_<code>' column is produced per colour code, named after the
    float form of the code (e.g. 'color_0.0').

    Args:
        frame: DataFrame holding a 'color' column of float colour codes.
        color_codes: Iterable of every colour code that must get a column.

    Returns:
        A new DataFrame: the original columns without 'color', followed by the
        indicator columns.
    """
    dummies = pd.get_dummies(frame['color'], prefix='color')
    # get_dummies only creates columns for codes that actually occur. A small
    # (e.g. scaled-down) sample can lack a colour entirely, which would make
    # the later selection by the fixed feature list fail with a KeyError, so
    # add an all-zero indicator for every code that is missing.
    for code in color_codes:
        column = 'color_' + str(float(code))
        if column not in dummies.columns:
            dummies[column] = 0
    # ALEX: make notebook run -- `axis` is passed by keyword
    # (the original call was `.drop('color', 1)`).
    return pd.concat([frame, dummies], axis=1).drop('color', axis=1)


train = _one_hot_color(train, color_dict.values())
test = _one_hot_color(test, color_dict.values())

## Select features ##

In [11]:
# Columns fed to the model: the four numeric measurements followed by the
# colour indicator columns, which are named after the float codes 0.0 .. 5.0.
feature_cols = (
    ['bone_length', 'rotting_flesh', 'hair_length', 'has_soul']
    + ['color_{}.0'.format(code) for code in range(6)]
)

## Fit logistic regression model ##

In [12]:
# Independent variables (the selected feature columns) and the outcome
# variable (the encoded monster type) from the training frame.
X = train[feature_cols]
y = train['type']

# ALEX: remove ML code
# logreg = LogisticRegression()
# logreg.fit(X, y) # Fit model

## Make prediction ##

In [13]:
# Same feature columns as the training matrix, taken from the test set.
X_test = test[feature_cols]
# ALEX: remove ML code
# (originally: new_type_pred = logreg.predict(X_test), i.e. the fitted model's
# prediction for the test frame)
# NOTE(review): with the model removed, the "prediction" is simply the training
# labels; it is not derived from X_test and presumably does not line up with
# the test rows one-to-one -- confirm this stand-in is intended.
new_type_pred = train['type']

## Create submission file ##

In [14]:
# Pair the test ids with the (numeric) predicted types.
submission = pd.DataFrame({'id': test['id'], 'type': new_type_pred})

# Numeric code -> monster name, to turn the predictions back into labels.
type_dict_sub = {0: 'Ghoul', 1: 'Goblin', 2: 'Ghost'}
submission = submission.assign(
    type=submission['type'].map(type_dict_sub).astype(object)
)

# Save without the index so the file contains only the 'id' and 'type' columns.
submission.to_csv('submission.csv', index=False)

This particular submission gave me a score of 0.71645. Any tips on how to improve this model's accuracy are more than welcome. I'm hoping to learn as much as I can by continuing to improve my prediction. 