# GGG Competition

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the competition tables, using the 'id' column as the row index.
train = pd.read_csv('./datasets/train.csv', index_col='id')
test = pd.read_csv('./datasets/test.csv', index_col='id')

# Stack train on top of test so later feature work runs on a single frame.
# NOTE(review): presumably test.csv has no 'type' column, so its rows get NaN there - confirm.
df = pd.concat([train, test])

# Cell output: number of training creatures per type label.
train['type'].value_counts()

## Data Preprocessing and Feature Engineering

In [None]:
# Boolean row selectors over df, one per creature type label.
Ghoul_mask = df['type'].eq('Ghoul')
Goblin_mask = df['type'].eq('Goblin')
Ghost_mask = df['type'].eq('Ghost')

In [None]:
# Replace the string labels in 'type' with integer codes.
# NOTE(review): later cells treat code 3 as "no label yet"; presumably that is how
# the encoder maps missing 'type' values - confirm against the installed scikit-learn.

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['type'] = le.fit_transform(df['type'])

# Keep the encoder's class array so integer codes can be mapped back to labels;
# it is also this cell's displayed output.
types = le.classes_
types

In [None]:
# 'color' is treated as uninformative noise, so remove it from the frame.
df.drop(columns=['color'], inplace=True)

# Names of the feature columns: everything except the 'type' target.
columns = df.columns.drop('type')

df.head()

In [None]:
# 'all_df' is only created by a later cell (the one that concatenates train_df
# and test_df). On a fresh kernel it does not exist yet, so fall back to 'df'.
# Only NameError is caught: any other failure should surface, not be swallowed.
try:
    all_df
except NameError:
    all_df = df

# For every feature: print the per-type mean and draw a scatter plot of the
# feature value against the encoded type, with rows ordered by feature value.
for col in columns:
    print(col)
    sorted_col_index = all_df[col].sort_values().index
    for typ in types:
        # Position of this label in `types` is its integer code. Entries that
        # do not compare equal to themselves (e.g. NaN) yield no match and are skipped.
        code = np.where(types == typ)[0]
        if len(code) > 0:
            type_mask = (all_df['type'] == code[0])
            print(typ)
            print(all_df.loc[type_mask, col].mean())
    fig, ax = plt.subplots()
    ax.scatter(all_df.loc[sorted_col_index, col], all_df.loc[sorted_col_index, 'type'])
    ax.set_title(col)
    print()

In [None]:
# Row selectors for the integer-encoded labels (Ghost=0, Ghoul=1, Goblin=2).
Ghost_mask = df['type'].eq(0)
Ghoul_mask = df['type'].eq(1)
Goblin_mask = df['type'].eq(2)


def _means_by_type(feature):
    """Return the mean of `feature` over Ghost, Ghoul and Goblin rows of df, in that order."""
    return [df.loc[mask, feature].mean() for mask in (Ghost_mask, Ghoul_mask, Goblin_mask)]


# For each feature: the three per-type means and one tolerance value.
# Despite the '_var' suffix the tolerances are not statistical variances; each
# is derived directly from one of the class means.

bone_means = _means_by_type('bone_length')
bone_ghost_mean, bone_ghoul_mean, bone_goblin_mean = bone_means
bone_var = 1 - bone_ghoul_mean
# bone_var = bone_ghost_mean

hair_means = _means_by_type('hair_length')
hair_ghost_mean, hair_ghoul_mean, hair_goblin_mean = hair_means
hair_var = hair_ghost_mean

soul_means = _means_by_type('has_soul')
soul_ghost_mean, soul_ghoul_mean, soul_goblin_mean = soul_means
soul_var = 1 - soul_ghoul_mean
# soul_var = soul_ghost_mean

flesh_means = _means_by_type('rotting_flesh')
flesh_ghost_mean, flesh_ghoul_mean, flesh_goblin_mean = flesh_means
flesh_var = flesh_goblin_mean

# Stack into arrays: means[feature, type code] and var_s[feature], with the
# features ordered bone, flesh, hair, soul.
means = np.array([bone_means, flesh_means, hair_means, soul_means])
var_s = np.array([bone_var, flesh_var, hair_var, soul_var])

# d_means = np.array([[-0.02, -0.02, 0.01], [-0.02, -0.01, 0.01], [0, 0, 0], [0, 0.01, 0.02]]) * (1)
# means += d_means

In [None]:
# Display the per-feature (rows) by per-type (columns) mean table.
means

In [None]:
# Display the per-feature tolerance values.
var_s

In [None]:
# Vote counters for the manual labelling pass: one row per creature in df,
# one column per type code.
vote_table = np.zeros((len(df), 3))

In [None]:
# Integer codes of the three real creature types.
typs = [0, 1, 2]

# Resolve the position of 'type' by name instead of assuming it is the last column.
type_col = df.columns.get_loc('type')

# Manual labelling: for every creature whose type code is 3 (not labelled yet),
# each feature votes for every type whose band [mean - tol, mean + tol] contains
# the creature's value.
for i in range(df.shape[0]):
    creature = df.iloc[i]
    if creature['type'] != 3:
        continue

    # Start from a clean row so re-running this cell cannot add a second round
    # of votes on top of the first and push a weak match over the threshold.
    vote_table[i] = 0
    for j, col in enumerate(columns):
        for typ in typs:
            if means[j, typ] - var_s[j] <= creature[col] <= means[j, typ] + var_s[j]:
                vote_table[i, typ] += 1

    ind = np.argmax(vote_table[i])
    # Accept only an unambiguous winner with more than 3 votes, i.e. all four
    # features agree and no other type ties with it.
    is_unique_winner = np.count_nonzero(vote_table[i] == vote_table[i, ind]) == 1
    if vote_table[i].max() > 3 and is_unique_winner:
        df.iloc[i, type_col] = ind

In [None]:
# Number of creatures per type code in the whole frame after the manual
# labelling pass (code 3 marks rows that are still unlabelled).
df['type'].value_counts()

# TRAIN-TEST Split

In [None]:
# Rows still carrying code 3 have no label yet and form the prediction set;
# everything else (given labels plus manually assigned ones) is training data.
is_unlabelled = df['type'] == 3
test_df = df[is_unlabelled].drop('type', axis=1)
train_df = df[~is_unlabelled]

# Training target.
y_train = train_df['type']

# Remove the target from the training features. A non-mutating drop is used
# because train_df is a selection from df, and dropping in place on such a
# selection triggers pandas' SettingWithCopyWarning.
train_df = train_df.drop('type', axis=1)
train_df.head()

## Model

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Fit the classifier on the labelled rows.
model = LogisticRegression()
# Alternative estimators kept for quick swapping:
# model = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=2)
# model = KNeighborsClassifier(n_neighbors=61)
model.fit(train_df, y_train)

In [None]:
# Predict a type code for every row that is still unlabelled.
y_pred = model.predict(test_df)

In [None]:
# Attach the predicted codes as a 'type' column; from here on test_df is no
# longer a features-only frame.
test_df['type'] = y_pred

In [None]:
# Put the target back on the training frame so it has the same columns as test_df.
train_df['type'] = y_train

In [None]:
# Recombine both parts into one frame in which every row has a type code.
all_df = pd.concat([train_df, test_df])

In [None]:
# Read the submission template, indexed by creature id.
submission_df = pd.read_csv('./datasets/sample_submission.csv', index_col='id')
# Look up each submission id's integer code in all_df and map it back to its
# string label through the encoder's class array.
prediction_types = types[all_df.loc[submission_df.index, 'type']]
submission_df['type'] = prediction_types

# NOTE(review): assumes the ./results directory already exists - confirm.
submission_df.to_csv('./results/LR.csv')

In [None]:
# Number of creatures per type code in the recombined frame.
all_df['type'].value_counts()

# 3rd APPROACH
## Generate new samples

In [None]:
# Display the feature column names; their order matches the rows of `means` and `var_s`.
columns

In [None]:
# Number of synthetic creatures to generate per type.
N = 100

# Dedicated generator with a fixed seed so re-running the notebook produces the
# same synthetic samples.
rng = np.random.RandomState(42)

# M[typ, i] holds N draws for feature columns[i]; the extra last row stores the
# type code itself.
M = np.zeros((len(typs), len(columns) + 1, N))

for typ in typs:
    M[typ, len(columns)] = typ
    for i in range(len(columns)):
        # Normal draws centred on the class mean, with a quarter of the
        # feature's tolerance as the standard deviation.
        M[typ, i] = rng.normal(means[i, typ], var_s[i] / 4, size=N)

In [None]:
# np.random.RandomState(42)
# np.random.RandomState(42).uniform(0, 1)

In [None]:
# Scratch check: a (3, N) array whose rows are filled with 0, 1 and 2.
np.array([np.zeros(N), np.ones(N) * 1, np.ones(N) * 2])

In [None]:
# Turn each per-type block into (samples x columns) and stack the three blocks
# into one table, in type-code order.
M_RES = np.vstack([per_type.T for per_type in M])

In [None]:
# Display the stacked synthetic samples (feature values followed by the type code).
M_RES

In [None]:
# Display the recombined frame; its column layout is reused for the synthetic rows.
all_df

In [None]:
# Row labels for the synthetic creatures, numbered from 2000 upwards.
# NOTE(review): presumably chosen to lie above every real creature id - confirm.
indices = 2000 + np.arange(len(typs) * N)

# Wrap the generated values in a frame with the same columns as all_df.
rand_df = pd.DataFrame(M_RES, columns=all_df.columns, index=indices)

# Append the synthetic rows to df (re-running this cell appends them again)
# and turn the type codes back into plain integers.
df = pd.concat([df, rand_df])
df['type'] = df['type'].map(int)

In [None]:
# Display the frame including the appended synthetic rows.
df

In [None]:
# Sanity check on the spread of a normal distribution: draw K values with
# mean 0.34 and standard deviation 0.11 and print the smallest and largest.
K = 1000
arr = np.random.normal(0.34, 0.11, size=K)
print(arr.min(), arr.max())

# 2nd APPROACH 

# Bone

In [None]:
# Row labels of df ordered by ascending bone_length.
bone_ind = df['bone_length'].sort_values().index

In [None]:
# Scatter of rotting_flesh against the row label, for rows taken in bone_length order.
plt.scatter(df.loc[bone_ind].index, df.loc[bone_ind]['rotting_flesh'])

In [None]:
# Type counts among the 225 creatures with the smallest bone_length.
df.loc[bone_ind, 'type'].head(225).value_counts()

In [None]:
# bone_length values ordered from smallest to largest.
sorted_bone_lengths = df.loc[bone_ind, 'bone_length']

# Smallest value, and the value at sorted position 225. Despite its name the
# latter is a positional pick, not an average.
min_Ghost_bone_length = sorted_bone_lengths.iloc[0]
mean_Ghost_bone_length = sorted_bone_lengths.iloc[225]

print('bone_length mean for Ghost')
print(mean_Ghost_bone_length)
print('varience =', mean_Ghost_bone_length - min_Ghost_bone_length)

In [None]:
# Rows whose bone_length is at most twice the reference value picked above.
bone_Ghost_mask = df['bone_length'].le(2 * mean_Ghost_bone_length)

In [None]:
# Type counts among the rows selected by the bone_length threshold.
df.loc[bone_Ghost_mask, 'type'].value_counts()

# Hair

In [None]:
# Row labels of df ordered by ascending hair_length.
hair_ind = df['hair_length'].sort_values().index

In [None]:
# Type counts among the 50 creatures with the smallest hair_length.
df.loc[hair_ind, 'type'].head(50).value_counts()

In [None]:
# Type counts among the 40 creatures with the smallest bone_length.
# NOTE(review): this cell sits in the Hair section but indexes with bone_ind -
# confirm whether hair_ind was intended.
df.loc[bone_ind, 'type'].head(40).value_counts()

# Soul

In [None]:
# Row labels of df ordered by ascending has_soul.
soul_ind = df['has_soul'].sort_values().index

In [None]:
# Type counts among the 50 creatures with the smallest has_soul.
df.loc[soul_ind, 'type'].head(50).value_counts()

# Flesh

In [None]:
# Row labels of df ordered by ascending rotting_flesh.
flesh_ind = df['rotting_flesh'].sort_values().index

In [None]:
# Type counts among the 50 creatures with the smallest rotting_flesh.
df.loc[flesh_ind, 'type'].head(50).value_counts()

# REVERSE

# Bone

In [None]:
# Row labels of df ordered by descending bone_length.
rev_bone_ind = df['bone_length'].sort_values(ascending=False).index

In [None]:
# Type counts among the 15 creatures with the largest bone_length.
df.loc[rev_bone_ind, 'type'].head(15).value_counts()

# Hair

In [None]:
# Row labels of df ordered by descending hair_length.
rev_hair_ind = df['hair_length'].sort_values(ascending=False).index

In [None]:
# Type counts among the 20 creatures with the largest hair_length.
df.loc[rev_hair_ind, 'type'].head(20).value_counts()

# Soul

In [None]:
# Row labels of df ordered by descending has_soul.
rev_soul_ind = df['has_soul'].sort_values(ascending=False).index

In [None]:
# Type counts among the 10 creatures with the largest has_soul.
df.loc[rev_soul_ind, 'type'].head(10).value_counts()

# Flesh

In [None]:
# Row labels of df ordered by descending rotting_flesh.
rev_flesh_ind = df['rotting_flesh'].sort_values(ascending=False).index

In [None]:
# Type counts among the 10 creatures with the largest rotting_flesh.
df.loc[rev_flesh_ind, 'type'].head(10).value_counts()

# Fill types

In [None]:
# Hand-assigned labels for the extremes of each feature ranking, applied in
# the listed order so that a later rule overwrites an earlier one on shared rows.
# NOTE(review): these are string labels, while earlier cells store 'type' as
# integer codes - confirm which representation this section expects.
manual_labels = [
    (bone_ind[:35], 'Ghost'),
    (hair_ind[:40], 'Ghost'),
    (soul_ind[:50], 'Ghost'),
    (rev_bone_ind[:15], 'Ghoul'),
    (rev_hair_ind[:20], 'Ghoul'),
    (rev_soul_ind[:10], 'Ghoul'),
    (rev_flesh_ind[:10], 'Ghost'),
]
for row_labels, creature_type in manual_labels:
    df.loc[row_labels, 'type'] = creature_type

In [None]:
# Number of creatures per 'type' value after the hand-assigned labels.
df['type'].value_counts()

# New train sample

In [None]:
# Training rows are those that carry a non-missing 'type' value.
train_df = df[df['type'].notna()]

# Label Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Fit a fresh encoder on the labels of the rebuilt training set.
# NOTE(review): after the "Fill types" cell the column may mix integer codes
# with string labels - confirm it is homogeneous before encoding.
le = LabelEncoder()

# y holds the integer targets; `types` is overwritten with the new class array
# used later to map predictions back to labels.
y = le.fit_transform(train_df['type'])
types = le.classes_

In [None]:
# Remove the target from the training features. The original line had a stray
# ']' and could not be parsed. A non-mutating drop is used because train_df is
# a selection from df, where an in-place drop triggers SettingWithCopyWarning.
train_df = train_df.drop('type', axis=1)

In [None]:
# Preview of the training features.
train_df.head()

In [None]:
# Preview of the test frame as loaded from CSV (no columns have been dropped from it).
test.head()

# Training simple model: logreg

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Fit a logistic regression with default settings on the rebuilt training set.
clf = LogisticRegression()
clf.fit(train_df, y)

In [None]:
# Predict for every row of the original test set, restricted to the feature
# columns. test_df from the earlier approach is not suitable here: it has had a
# 'type' column attached and holds only the rows that were still unlabelled,
# whereas the predictions are written to the full submission frame.
predictions = clf.predict(test[columns])

In [None]:
# Read the submission template, indexed by creature id.
submission_df = pd.read_csv('./datasets/sample_submission.csv', index_col='id')
# Map the predicted integer codes back to string labels. The assignment is
# positional, so the prediction order must match the template's row order.
submission_df['type'] = types[predictions]

# NOTE(review): assumes the ./results directory already exists - confirm.
submission_df.to_csv('./results/LogReg_NEW.csv')

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Shallow forest (depth-1 trees) with a fixed seed, fitted on the rebuilt training set.
model = RandomForestClassifier(n_estimators=100, max_depth=1, random_state=2)
model.fit(train_df, y)

# Predict on the feature columns only. `test` is the frame as read from CSV and
# has had no columns dropped, while 'color' was removed from the training features.
predictions = model.predict(test[columns])

In [None]:
# Read the submission template, indexed by creature id.
submission_df = pd.read_csv('./datasets/sample_submission.csv', index_col='id')
# Map the predicted integer codes back to string labels. The assignment is
# positional, so the prediction order must match the template's row order.
submission_df['type'] = types[predictions]

# NOTE(review): assumes the ./results directory already exists - confirm.
submission_df.to_csv('./results/RanFor_NEW.csv')