# CIHW-2 Alireza Rashidi

## Preparing Dataset files:

In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
!cp /content/gdrive/MyDrive/Datasets/Dataset.zip /content/sample_data

In [3]:
!mkdir /content/CIHW2_Datasets
!unzip /content/sample_data/Dataset.zip -d /content/CIHW2_Datasets

Archive:  /content/sample_data/Dataset.zip
  inflating: /content/CIHW2_Datasets/test.csv  
  inflating: /content/CIHW2_Datasets/train.csv  


## Understanding of Dataset:

In [5]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix
import math

In [6]:
train_df = pd.read_csv('/content/CIHW2_Datasets/train.csv')
train_df.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [7]:
print(train_df.shape)

(891, 12)


In [8]:
print('shape of dataset: ', train_df.shape)
print('\ncol names: ', train_df.columns)
print('\nnumber of columns: ', len(train_df.columns))
print('\ncolomns data types:\n', train_df.dtypes)
# print('\nmissing values:\n', train_df.isnull().sum())
print(" \nCount total missing values in a DataFrame: ", train_df.isnull().sum().sum())

shape of dataset:  (891, 12)

col names:  Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

number of columns:  12

colomns data types:
 PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object
 
Count total missing values in a DataFrame:  866


### Seeing the number of missing values in percent:

In [9]:
total = train_df.isnull().sum().sort_values(ascending=False)
percent_1 = train_df.isnull().sum()/train_df.isnull().count()*100
percent_2 = (round(percent_1, 1)).sort_values(ascending=False)
missing_data = pd.concat([total, percent_2], axis=1, keys=['Total missing values', 'In percent %'])
missing_data

Unnamed: 0,Total missing values,In percent %
Cabin,687,77.1
Age,177,19.9
Embarked,2,0.2
Fare,0,0.0
Ticket,0,0.0
Parch,0,0.0
SibSp,0,0.0
Sex,0,0.0
Name,0,0.0
Pclass,0,0.0


In [10]:
train_df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


## Data Preprocessing:

### Handling Missing data and data cleaning:

In [11]:
"""At first, we delete cabin column becuase it has so much missing data and we can't do any thing to fix this problem, e.g: data imputation. 
   Also we delete passenger's ID from dataset beacuse it cuases the model to overfit due to uniqueness of this columns's values and it does not contribute to a persons survival probability.
   Name column is also like passenger's ID.
"""
train_df = train_df.drop(['Cabin', 'PassengerId', 'Name', 'Ticket'], axis = 1)
print('shape of dataset: ', train_df.shape)
print('\ncol names: ', train_df.columns)

shape of dataset:  (891, 8)

col names:  Index(['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
       'Embarked'],
      dtype='object')


### Handling Age column missing data and catagorize it's values:

In [12]:
"""About Age feature, we use data imputation beacuse we have some information about this column. """
train_df['Age'] = train_df['Age'].replace(np.NaN, train_df['Age'].mean() )
train_df['Age']

0      22.000000
1      38.000000
2      26.000000
3      35.000000
4      35.000000
         ...    
886    27.000000
887    19.000000
888    29.699118
889    26.000000
890    32.000000
Name: Age, Length: 891, dtype: float64

In [13]:
# creating categories for age column.
train_df.loc[ train_df['Age'] <= 19, 'Age'] = 0
train_df.loc[(train_df['Age'] > 19) & (train_df['Age'] <= 25), 'Age'] = 1
train_df.loc[(train_df['Age'] > 25) & (train_df['Age'] <= 31.8), 'Age'] = 2
train_df.loc[(train_df['Age'] > 31.8) & (train_df['Age'] <= 41), 'Age'] = 3
train_df.loc[train_df['Age'] > 41, 'Age'] = 4
    
train_df['Age']

0      1.0
1      3.0
2      2.0
3      3.0
4      3.0
      ... 
886    2.0
887    0.0
888    2.0
889    2.0
890    3.0
Name: Age, Length: 891, dtype: float64

In [14]:
# we will print number of samples per each catagory.
train_df['Age'].value_counts()

2.0    304
0.0    164
3.0    144
4.0    142
1.0    137
Name: Age, dtype: int64

#### Handeling Embarked colomn missing data: 

In [15]:
train_df['Embarked'].describe()

count     889
unique      3
top         S
freq      644
Name: Embarked, dtype: object

In [16]:
""" due to freaquency of 'S' in this column and becasue we have 2 missing data, we replace them with most freaquent one."""
train_df['Embarked'] = train_df['Embarked'].fillna('S')       # fillna fills null or missing values with specific value.
train_df['Embarked'].describe()

count     891
unique      3
top         S
freq      646
Name: Embarked, dtype: object

### Encoding and converting catagorical values to numerical values:

Usually Algorithems Work better with numerical values, so we must convert this catagorical values to numerical values to achive maximum accuracy

In [17]:
Embarked_encodings = {'S': 0, 'C': 1, 'Q': 2}
gender_encodings = {'male' : 0,'female' : 1}
train_df['Embarked'] = train_df['Embarked'].map(Embarked_encodings)
train_df['Sex'] = train_df['Sex'].map(gender_encodings)

As shown in below statement, we converted all of colomns to numrical data that we can use them to train models and fixed missing data problem:

In [18]:
print('\ncolomns data types:\n', train_df.dtypes)
print('\nmissing values:\n', train_df.isnull().sum())


colomns data types:
 Survived      int64
Pclass        int64
Sex           int64
Age         float64
SibSp         int64
Parch         int64
Fare        float64
Embarked      int64
dtype: object

missing values:
 Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64


## Spliting features and labels from dataset:

In [19]:
train_y_df = train_df.iloc[:, 1]   # all rows from col number 1.
train_X_df = train_df.drop(['Survived'], axis = 1)  # spliting Survived column from features dataset.
print(train_X_df.shape, train_y_df.shape)

(891, 7) (891,)


In [20]:
train_X, validation_X, train_y, validation_y = train_test_split(train_X_df.to_numpy(), train_y_df.to_numpy(), train_size=0.85, random_state=2021)
print('Sample train features: \n', train_X[:3])
print('\nSample train labels: ', train_y[:3])

Sample train features: 
 [[  1.       1.       4.       1.       1.     164.8667   0.    ]
 [  2.       0.       1.       1.       2.      41.5792   1.    ]
 [  3.       0.       2.       0.       0.       8.05     0.    ]]

Sample train labels:  [1 2 3]


In [21]:
print(train_X.shape)
print(validation_X.shape)
print(train_y.shape)
print(validation_y.shape)

(757, 7)
(134, 7)
(757,)
(134,)


## k-Nearest Neighbors implementation:


In [22]:
class KNN_Classifier:

    def __init__(self, train_X, train_y, K=5):

        self.K = K
        self.train_X = train_X
        self.train_y = train_y

    #calculates euclidean_distance between 2 datapoints.
    def eucledian(self, point1, point2):
        return np.sqrt(np.sum((point1 - point2) ** 2))

    def get_neighbors(self, current_datapoint):
        dist_from_neighbors = []
        for neighbor_idx, train_row in enumerate (self.train_X):
            # calcualating the eucledian error from current data point to one of the datapoints in train dataset.
            dist = self.eucledian(current_datapoint, train_row)
            dist_from_neighbors.append({'neighbor_index': neighbor_idx, 'dist_value':dist})
        # sorting dist_from_neighbors according to the eucledian error.
        dist_from_neighbors.sort(key=lambda dists: dists['dist_value'])   # key - function that serves as a key for the sort comparison.
        # returning the k-nearest neighbors of current_datapoint using hyperparammeter K. 
        return dist_from_neighbors[:self.K]

    def predict(self, datapoint_X):
        neighbors = self.get_neighbors(datapoint_X)
        # extracting neighbors indexes and mapping them to their corresponding labels in train_y. 
        neighbor_labels = self.train_y[[element['neighbor_index'] for element in neighbors]]
        neighbor_labels = neighbor_labels.astype(np.int32)
         #Majority voting (counting frequent label in k nearest neighbors)
        prediction = np.bincount(neighbor_labels).argmax()
        return prediction

    # this function runs algo on the set of input data(validation or test data).
    def k_nearest_neighbors_run(self, validation_data):
        predictions = []
        for datapoint in validation_data:
            output = self.predict(datapoint)
            predictions.append(output)
        return np.array(predictions)

    # this function calcualtes score(accuracy) of the model.  
    def accuracy_metric(self, actual, predicted):
        correct_count = np.unique((actual == predicted), return_counts=True)[1][1]
        return correct_count / float(actual.shape[0]) * 100.0
    

### KNN Evaluation:

In [23]:
knn_model = KNN_Classifier(train_X, train_y, K=3)
print('some prediction on training data:')
for sample_idx in range(25):
    prediction = knn_model.predict(train_X[sample_idx])
    print('Actual label: {},  Predicted label: {}'.format(train_y[sample_idx], prediction))

some prediction on training data:
Actual label: 1,  Predicted label: 1
Actual label: 2,  Predicted label: 2
Actual label: 3,  Predicted label: 3
Actual label: 3,  Predicted label: 2
Actual label: 3,  Predicted label: 3
Actual label: 1,  Predicted label: 1
Actual label: 1,  Predicted label: 1
Actual label: 3,  Predicted label: 3
Actual label: 3,  Predicted label: 3
Actual label: 1,  Predicted label: 1
Actual label: 3,  Predicted label: 3
Actual label: 3,  Predicted label: 3
Actual label: 1,  Predicted label: 1
Actual label: 2,  Predicted label: 2
Actual label: 1,  Predicted label: 1
Actual label: 1,  Predicted label: 1
Actual label: 3,  Predicted label: 3
Actual label: 3,  Predicted label: 3
Actual label: 3,  Predicted label: 3
Actual label: 3,  Predicted label: 3
Actual label: 3,  Predicted label: 1
Actual label: 3,  Predicted label: 3
Actual label: 3,  Predicted label: 3
Actual label: 3,  Predicted label: 3
Actual label: 1,  Predicted label: 1


In [24]:
predictions = knn_model.k_nearest_neighbors_run(train_X)
print('Accuracy on train data: ', knn_model.accuracy_metric(train_y, predictions))

Accuracy on train data:  97.88639365918098


In [25]:
predictions = knn_model.k_nearest_neighbors_run(validation_X)
print('Actual validation labels: \n', validation_y[:10])
print('\nPredicted validation labels: \n', predictions[:10])

print('\nAccuracy on validation data: ', knn_model.accuracy_metric(validation_y, predictions))

Actual validation labels: 
 [3 3 2 3 3 3 1 3 2 3]

Predicted validation labels: 
 [3 3 2 3 2 3 1 3 2 3]

Accuracy on validation data:  95.52238805970148


In [26]:
print('confusion matrix for validation data KNN model: \n', confusion_matrix(validation_y, predictions))
print('\n', classification_report(validation_y, predictions))

confusion matrix for validation data KNN model: 
 [[24  0  0]
 [ 1 25  2]
 [ 0  3 79]]

               precision    recall  f1-score   support

           1       0.96      1.00      0.98        24
           2       0.89      0.89      0.89        28
           3       0.98      0.96      0.97        82

    accuracy                           0.96       134
   macro avg       0.94      0.95      0.95       134
weighted avg       0.96      0.96      0.96       134



## SVM implementation:

### implemented using linear kernel:

In [27]:
from sklearn import svm

# training svm model on training data (with linear kernel.)
svm_model = svm.SVC(kernel='linear')
svm_model.fit(train_X, train_y)     # fit function trains model (updates model's weights)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [28]:
y_pred_validation = svm_model.predict(validation_X)
print('Actual validation data:\n',validation_y[:10])
print('\nPredicted on validation data:\n', y_pred_validation[:10])
print("\nAccuracy on validation data:",metrics.accuracy_score(validation_y, y_pred_validation) * 100)

Actual validation data:
 [3 3 2 3 3 3 1 3 2 3]

Predicted on validation data:
 [3 3 2 3 3 3 1 3 2 3]

Accuracy on validation data: 100.0


In [29]:
print('confusion matrix for validation data: \n', confusion_matrix(validation_y, y_pred_validation))
print('\n', classification_report(validation_y, y_pred_validation))

confusion matrix for validation data: 
 [[24  0  0]
 [ 0 28  0]
 [ 0  0 82]]

               precision    recall  f1-score   support

           1       1.00      1.00      1.00        24
           2       1.00      1.00      1.00        28
           3       1.00      1.00      1.00        82

    accuracy                           1.00       134
   macro avg       1.00      1.00      1.00       134
weighted avg       1.00      1.00      1.00       134



As we can see the score of validation data is 1 (100 percent) for SVM model and is little bit tricky so we can use cross validation to check the performance of alogritem in detail.

In [30]:
cv = cross_val_score(svm_model,validation_X, validation_y, cv=5) # 5 fold cross validation on SVM with linear kernel. 
print(cv)
print('\ncross validation mean score: ', cv.mean())

[1.         1.         0.88888889 1.         1.        ]

cross validation mean score:  0.9777777777777779


### Implemented using RBF kernel:

In [31]:
# training svm model on training data (with RBF kernel.)
svm_model_RBF = svm.SVC(kernel='rbf')
svm_model_RBF.fit(train_X, train_y)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [32]:
y_pred_validation = svm_model_RBF.predict(validation_X)
print('Actual validation data:\n',validation_y[:10])
print('\nPredicted on validation data:\n', y_pred_validation[:10])
print("\nAccuracy on validation data:",metrics.accuracy_score(validation_y, y_pred_validation) * 100)

Actual validation data:
 [3 3 2 3 3 3 1 3 2 3]

Predicted on validation data:
 [3 3 3 3 3 3 1 3 3 3]

Accuracy on validation data: 70.8955223880597


In [33]:
print('confusion matrix for validation data: \n', confusion_matrix(validation_y, y_pred_validation))
print('\n', classification_report(validation_y, y_pred_validation))

confusion matrix for validation data: 
 [[19  0  5]
 [ 2  0 26]
 [ 6  0 76]]

               precision    recall  f1-score   support

           1       0.70      0.79      0.75        24
           2       0.00      0.00      0.00        28
           3       0.71      0.93      0.80        82

    accuracy                           0.71       134
   macro avg       0.47      0.57      0.52       134
weighted avg       0.56      0.71      0.63       134



  _warn_prf(average, modifier, msg_start, len(result))


## Evaluation of trained models on test data:

first we must load test data and apply preprocesses on it.

In [34]:
test_df = pd.read_csv('/content/CIHW2_Datasets/test.csv')
print(test_df.shape)

(418, 11)


In [35]:
test_df = test_df.drop(['Cabin', 'PassengerId', 'Name', 'Ticket'], axis = 1)
test_df['Age'] = test_df['Age'].replace(np.NaN, test_df['Age'].mean() )
test_df['Fare'] = test_df['Fare'].replace(np.NaN, test_df['Fare'].mean() )
test_df['Embarked'] = test_df['Embarked'].fillna('S')

test_df['Embarked'] = test_df['Embarked'].map(Embarked_encodings)
test_df['Sex'] = test_df['Sex'].map(gender_encodings) 

test_df.loc[ test_df['Age'] <= 19, 'Age'] = 0
test_df.loc[(test_df['Age'] > 19) & (test_df['Age'] <= 25), 'Age'] = 1
test_df.loc[(test_df['Age'] > 25) & (test_df['Age'] <= 31.8), 'Age'] = 2
test_df.loc[(test_df['Age'] > 31.8) & (test_df['Age'] <= 41), 'Age'] = 3
test_df.loc[test_df['Age'] > 41, 'Age'] = 4

print('shape of preprocessed dataset: ', test_df.shape)
test_df.head(10)

shape of preprocessed dataset:  (418, 7)


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,0,3.0,0,0,7.8292,2
1,3,1,4.0,1,0,7.0,0
2,2,0,4.0,0,0,9.6875,2
3,3,0,2.0,0,0,8.6625,0
4,3,1,1.0,1,1,12.2875,0
5,3,0,0.0,0,0,9.225,0
6,3,1,2.0,0,0,7.6292,2
7,2,0,2.0,1,1,29.0,0
8,3,1,0.0,0,0,7.2292,1
9,3,0,1.0,2,0,24.15,0


In [36]:
print('\nmissing values:\n', train_df.isnull().sum())


missing values:
 Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64


In [37]:
test_X = test_df.to_numpy()
print(test_X[:5])

[[ 3.      0.      3.      0.      0.      7.8292  2.    ]
 [ 3.      1.      4.      1.      0.      7.      0.    ]
 [ 2.      0.      4.      0.      0.      9.6875  2.    ]
 [ 3.      0.      2.      0.      0.      8.6625  0.    ]
 [ 3.      1.      1.      1.      1.     12.2875  0.    ]]


### Evaluation of KNN and SVM models using test data:

In [38]:
test_predictions = knn_model.k_nearest_neighbors_run(test_X)
print('\nPredicted test data labels (KNN Model): \n', test_predictions[:25])


Predicted test data labels (KNN Model): 
 [3 3 2 3 3 3 3 2 3 3 3 1 1 2 1 2 2 3 3 3 1 3 1 1 1]


In [39]:
y_pred_test = svm_model.predict(test_X)
print('\nPredicted test data labels (SVM with linear kernel): \n', y_pred_test[:25])


Predicted test data labels (SVM with linear kernel): 
 [3 3 2 3 3 3 3 2 3 3 3 1 1 2 1 2 2 3 3 3 1 3 1 1 1]


In [40]:
y_pred_test_RBF = svm_model_RBF.predict(test_X)
print('\nPredicted test data labels (SVM with RBF kernel): \n', y_pred_test_RBF[:25])


Predicted test data labels (SVM with RBF kernel): 
 [3 3 3 3 3 3 3 3 3 3 3 3 1 3 1 3 3 3 3 3 1 3 1 1 1]


## Saving Predictions in .csv file:

In [43]:
np.savetxt("KNN_Predictions.csv", test_predictions, delimiter=",")
np.savetxt("SVM_linear_Predictions.csv", y_pred_test, delimiter=",")
np.savetxt("SVM_RBF_Predictions.csv", y_pred_test_RBF, delimiter=",")