In [2]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from pandas.plotting import scatter_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import math

"""ACTIVITY 1"""

# --- Load data ---
df = pd.read_csv('train.csv')

# NOTE(review): the StandardScaler below is fitted on every row before the
# train/test split performed in a later cell, so test-set statistics leak
# into the training features. Consider fitting the scaler on the training
# split only.

# --- LINTING 1 --- Drop unnecessary data
# `columns=` replaces the positional axis argument (`df.drop('Name', 1)`),
# which pandas deprecated and no longer accepts from 2.0 on.
df = df.drop(columns=['Name', 'Cabin', 'PassengerId'])

# --- LINTING 2 --- Replace null Age values
# The result is assigned back instead of calling `inplace=True` on the column
# selection, so the frame is updated whether `df['Age']` is a view or a copy.
df['Age'] = df['Age'].interpolate()

# --- LINTING 3 --- Drop leftovers with null
# `.copy()` gives an independent frame, so the column assignments below do
# not trigger chained-assignment warnings.
df = df.dropna().copy()

# --- LINTING 4 --- Replace literals with numeric values
# `factorize` returns (integer codes, unique values); the unique values are
# kept so the codes can be mapped back to the original literals.
df['Sex'], unique_sex_keys = pd.factorize(df['Sex'])
df['Ticket'], unique_ticket_keys = pd.factorize(df['Ticket'])
# Embarked is one-hot encoded into Embarked_* columns; a float dtype keeps the
# indicator columns numeric on every pandas version before they are scaled.
df = pd.get_dummies(df, columns=['Embarked'], dtype=float)


# --- LINTING 5 --- Normalize values
# Every column except the target is standardized. Whole columns are replaced
# (rather than written through `.loc`) so integer columns become float64
# instead of having floats forced into an integer dtype.
feature_columns = [column for column in df.columns if column != 'Survived']
df[feature_columns] = StandardScaler().fit_transform(df[feature_columns])

# --- Head visualization ---
# Only `df.info()` produces output: a notebook renders just the last
# expression of a cell, and `info()` prints its summary directly.
df.head()
df.describe()
df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Survived    889 non-null    int64  
 1   Pclass      889 non-null    float64
 2   Sex         889 non-null    float64
 3   Age         889 non-null    float64
 4   SibSp       889 non-null    float64
 5   Parch       889 non-null    float64
 6   Ticket      889 non-null    float64
 7   Fare        889 non-null    float64
 8   Embarked_C  889 non-null    float64
 9   Embarked_Q  889 non-null    float64
 10  Embarked_S  889 non-null    float64
dtypes: float64(10), int64(1)
memory usage: 83.3 KB


In [3]:
""" ACTIVITY 2"""
def get_euclidian_distance(X_train_row, X_test_row):
    """Overwrite ``X_train_row`` with its squared per-feature difference to ``X_test_row``.

    Despite the name, this does not return a distance: each position ``i`` of
    ``X_train_row`` is replaced by ``(X_test_row[i] - X_train_row[i]) ** 2``.
    The caller is expected to sum the components and take the square root.

    Parameters
    ----------
    X_train_row : pandas.Series or mutable sequence
        Row that is modified IN PLACE and returned.
    X_test_row : pandas.Series or sequence
        Row to compare against; it is only read. Positions are matched by
        order, not by label, and ``len(X_test_row)`` positions are processed.

    Returns
    -------
    The same ``X_train_row`` object, now holding the squared differences.
    """
    # Integer keys on a label-indexed Series rely on a deprecated positional
    # fallback, so Series are accessed explicitly by position via `.iloc`.
    # Plain sequences (lists, numpy arrays) are indexed directly.
    train_at = X_train_row.iloc if isinstance(X_train_row, pd.Series) else X_train_row
    test_at = X_test_row.iloc if isinstance(X_test_row, pd.Series) else X_test_row

    for i in range(len(X_test_row)):
        train_at[i] = (test_at[i] - train_at[i]) ** 2
    return X_train_row

class MyKNeighborsClassifier():
    """Brute-force k-nearest-neighbours classifier using Euclidean distance.

    Labels are read from, and predictions written to, a column named
    ``'Survived'``.

    Parameters
    ----------
    k : int, default 5
        Number of nearest training rows that vote on each prediction.
    distance_columns : iterable of str, optional
        Feature columns used to compute the distance. Defaults to
        ``DEFAULT_DISTANCE_COLUMNS``, the set this classifier was originally
        hard-coded to use.
    """

    # Columns that enter the distance by default. 'Sex' is not part of this
    # list: the original implementation had it commented out of the sum.
    DEFAULT_DISTANCE_COLUMNS = (
        'Pclass',
        'Age',
        'SibSp',
        'Parch',
        'Ticket',
        'Fare',
        'Embarked_C',
        'Embarked_Q',
        'Embarked_S',
    )

    def __init__(self, k=5, distance_columns=None):
        self.k = k
        self.distance_columns = list(
            self.DEFAULT_DISTANCE_COLUMNS if distance_columns is None else distance_columns
        )

    def fit(self, X_train, y_train):
        """Store copies of the training features and labels.

        ``X_train`` must contain every column in ``self.distance_columns``;
        ``y_train`` must be a DataFrame with a ``'Survived'`` column whose
        rows are in the same order as ``X_train``.
        """
        self.X_train = X_train.copy()
        self.y_train = y_train.copy()

    def predict(self, X_test):
        """Predict ``'Survived'`` for every row of ``X_test``.

        Returns a DataFrame indexed like ``X_test`` with a single
        ``'Survived'`` column holding the most frequent label among the
        ``k`` closest training rows.
        """
        # Use the data stored by fit(), not whatever X_train / y_train happen
        # to exist as notebook globals.
        train_values = self.X_train[self.distance_columns].to_numpy(dtype=float)
        test_values = X_test[self.distance_columns].to_numpy(dtype=float)
        train_labels = self.y_train['Survived'].to_numpy()

        predicted = []
        for test_row in test_values:
            # Euclidean distance from this test row to every training row.
            distances = np.sqrt(((train_values - test_row) ** 2).sum(axis=1))
            # A stable sort breaks distance ties in favour of the earlier
            # training row, matching nsmallest(..., keep='first').
            nearest = np.argsort(distances, kind='stable')[:self.k]
            # value_counts() sorts by frequency, so the first entry is the
            # majority label among the neighbours.
            votes = pd.Series(train_labels[nearest]).value_counts()
            predicted.append(votes.index[0])

        return pd.DataFrame({'Survived': predicted}, index=X_test.index.copy())
            
                    

# Split dataset
# Columns are selected by name rather than by position, so the split does not
# depend on the exact number or order of the feature columns.
X = df.drop(columns='Survived')  # Features: every column except the target
y = df[['Survived']]  # Target, kept as a one-column DataFrame

# A fixed seed makes the 80/20 split, and therefore the predictions and the
# accuracy computed from them, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
my_classifier = MyKNeighborsClassifier()
my_classifier.fit(X_train, y_train)
y_pred = my_classifier.predict(X_test)

# The next block checks the result.

In [152]:
""" ACTIVITY 2: Comparation """
# Build float-typed copies of the predictions and the ground truth so both
# 'Survived' columns share a dtype before they are compared; the original
# frames are left untouched.
y_pred_copy = y_pred.assign(
    Survived=pd.to_numeric(y_pred['Survived'], downcast="float")
)
y_test_copy = y_test.assign(
    Survived=pd.to_numeric(y_test['Survived'], downcast="float")
)

# `compare` keeps only the rows where the two Series differ, so its length is
# the number of wrong predictions.
mismatches = y_pred_copy['Survived'].compare(y_test_copy['Survived'])
wrong = len(mismatches)
total = len(y_test)

# Accuracy as a percentage of correctly predicted rows.
success = (total - wrong) / total
print(success * 100)


77.52808988764045
