In [11]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import KNNImputer
import numpy as np
import matplotlib.pyplot as plt



In [12]:
# Locations of the processed train/test CSV files, relative to this notebook.
train_file_path, test_file_path = '../data/processed/train.csv', '../data/processed/test.csv'

# Read both files into DataFrames, train first and then test.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (train_file_path, test_file_path)
)




In [None]:
# Preview only the first rows: printing the entire frame floods the cell output
# and falls back to the plain-text repr instead of the rich table display.
train_df.head()

In [None]:
# Remove columns that are not used as features (PassengerId, Name, Ticket, Cabin)
# from both the train and the test data. 'Survived' is kept here and split off below.
removed_columns_train_df = train_df.drop(columns=['PassengerId','Name','Ticket','Cabin'])
removed_columns_test_df = test_df.drop(columns=['PassengerId','Name','Ticket','Cabin'])

# Split the training data into the target variable ('Survived') and the independent variables.
# The target is selected by name: a positional lookup silently returns a different
# column (e.g. 'Pclass') whenever 'Survived' is not at the expected position.
y = removed_columns_train_df['Survived']  # target_value_survived

# drop() returns a new frame, so later column assignments on X do not
# modify removed_columns_train_df.
X = removed_columns_train_df.drop(columns=['Survived'])  # independent_variables


In [None]:
# Convert the categorical columns to numerical form using sklearn's LabelEncoder.
# LabelEncoder numbers the classes in sorted order, so for 'Sex': female -> 0, male -> 1.

# Fill missing values in the 'Embarked' column with its most common value.
# First determine the mode of the column ...
most_common_embarked_value = X['Embarked'].mode()[0]
print(f"most common embarked value is: {most_common_embarked_value}")

# ... then fill with that computed mode (not a hard-coded literal), so the fill
# value stays correct if the data changes.
embarked_filled = X['Embarked'].fillna(most_common_embarked_value)

# assign() builds a new frame with the encoded columns replacing the originals.
# This avoids writing into a possible slice of another frame and gives the
# encoded columns a proper integer dtype instead of keeping the old object dtype.
X = X.assign(
    Sex=LabelEncoder().fit_transform(X['Sex']),
    Embarked=LabelEncoder().fit_transform(embarked_filled),
)

# Find the range of 'Age' in the data (min()/max() skip missing values).
min_age = X['Age'].min()
max_age = X['Age'].max()
age_range = max_age - min_age

# Median age, a candidate fill value for the missing ages.
print(X['Age'].median())






In [None]:
# Rank the features by importance using an extra-trees ensemble.

# 'Age' is not imputed in X by the preceding cells, and scikit-learn versions
# without native NaN support for this estimator reject NaN input. Fill it with
# the median for this exploratory fit only; X itself is left untouched.
X_importance = X.fillna({'Age': X['Age'].median()})

# Fixed seed so the importances (and the plot) are identical on every run.
test_model = ExtraTreesClassifier(random_state=42)
test_model.fit(X_importance, y)
print(test_model.feature_importances_)

# Label each importance with its column name and plot the five largest.
feat_importances = pd.Series(test_model.feature_importances_, index=X_importance.columns)
ax = feat_importances.nlargest(5).plot(kind='barh')
ax.set(title='Top 5 feature importances (ExtraTreesClassifier)',
       xlabel='Importance', ylabel='Feature')
plt.show()

In [None]:
# Determine which columns have missing values and the percentage of data that is missing.
# isnull().mean() is the fraction of missing entries per column; * 100 turns it into a percentage.
# Uses removed_columns_train_df (the frame created when the unused columns were
# dropped); the previously referenced name removed_columns_df is never defined.
missing_value_precentage = removed_columns_train_df.isnull().mean() * 100
missing_value_precentage

In [6]:
# Determine how to handle missing age values (19.965% missing).
# Build one frame per strategy from removed_columns_train_df so the resulting
# 'Age' distributions can be compared side by side.

# TODO get distribution of ages after running the imputations

age_with_missing = removed_columns_train_df['Age']

# delete rows: keep only the rows where 'Age' is present
deleted_rows_df = removed_columns_train_df.dropna(subset=['Age'])

# median
median_imputed_df = removed_columns_train_df.copy()
median_imputed_df['Age'] = age_with_missing.fillna(age_with_missing.median())

# mean
mean_imputed_df = removed_columns_train_df.copy()
mean_imputed_df['Age'] = age_with_missing.fillna(age_with_missing.mean())

# mode
# mode() returns a Series (there can be ties). Passing that Series to fillna()
# aligns on the index and fills almost nothing, so take the first mode as a scalar.
mode_imputed_df = removed_columns_train_df.copy()
mode_imputed_df['Age'] = age_with_missing.fillna(age_with_missing.mode()[0])

# KNN
# The imputer is fitted on all numeric feature columns (target excluded): with
# 'Age' as the only column there is nothing to measure neighbour distance on,
# and the result degenerates to the column mean.
# NOTE(review): features are not scaled here, so large-valued columns dominate
# the distance - consider standardising first.
knn_feature_columns = (
    removed_columns_train_df
    .select_dtypes(include='number')
    .columns
    .drop('Survived', errors='ignore')
)
# Named knn_imputer so the imported KNNImputer class is not overwritten
# (overwriting it made this cell fail when run a second time).
knn_imputer = KNNImputer(n_neighbors=5)
knn_imputed_values = knn_imputer.fit_transform(removed_columns_train_df[knn_feature_columns])
knn_imputed_df = removed_columns_train_df.copy()
knn_imputed_df['Age'] = knn_imputed_values[:, knn_feature_columns.get_loc('Age')]




In [None]:
# Summary statistics of 'Age' (via describe()) for the original data and for
# each imputation strategy, so the effect of every strategy can be compared.

# original df (missing values still present)
# Uses removed_columns_train_df; the previously referenced removed_columns_df is never defined.
original_age_counts = removed_columns_train_df['Age'].describe()

# deleted age df
deleted_age_counts = deleted_rows_df['Age'].describe()

# median df
median_age_counts = median_imputed_df['Age'].describe()

# mean df
mean_age_counts = mean_imputed_df['Age'].describe()

# mode df
mode_age_counts = mode_imputed_df['Age'].describe()

# KNN df
KNN_age_counts = knn_imputed_df['Age'].describe()

# Print every summary under its heading; the leading "\n" separates the sections.
for heading, age_stats in (
    ("Original age distribution(w/ missing values)", original_age_counts),
    ("\ndeleted age distribution", deleted_age_counts),
    ("\nmedian age distribution", median_age_counts),
    ("\nmean age distribution", mean_age_counts),
    ("\nmode age distribution", mode_age_counts),
    ("\nKNN age distribution", KNN_age_counts),
):
    print(heading)
    print(age_stats)