In [1]:
import pandas as pd
import numpy as np

In [2]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [3]:
print(train_df.columns)

Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age',
       'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'Name', 'Transported'],
      dtype='object')


In [4]:
print(test_df.columns)

Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age',
       'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'Name'],
      dtype='object')


In [5]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load train and test datasets
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
print(test_data.shape)
# Drop irrelevant columns
train_data = train_data.drop(['Name', 'Cabin'], axis=1)
test_data = test_data.drop(['Name', 'Cabin'], axis=1)

# Handle missing values
train_data = train_data.fillna('0')
test_data = test_data.fillna('0')
print(test_data.shape)

# Encode categorical variables
le_homeplanet = LabelEncoder()
le_destination = LabelEncoder()

train_data['HomePlanet'] = le_homeplanet.fit_transform(train_data['HomePlanet'])
train_data['Destination'] = le_destination.fit_transform(train_data['Destination'])

# Apply the same encoding to the test set
test_data['HomePlanet'] = le_homeplanet.transform(test_data['HomePlanet'])
test_data['Destination'] = le_destination.transform(test_data['Destination'])

# Split the data into features and target variable
X = train_data.drop('Transported', axis=1)
y = train_data['Transported']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize numerical features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


(4277, 13)
(4277, 11)


In [6]:
import numpy as np

class KNNClassifier:
    def __init__(self, k=3):
        self.k = k

    def fit(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train

    def predict(self, X_test):
        predictions = []
        for x in X_test:
            distances = np.linalg.norm(self.X_train - x, axis=1)
            indices = np.argsort(distances)[:self.k]
            k_nearest_labels = self.y_train.iloc[indices]
            prediction = np.bincount(k_nearest_labels).argmax()
            predictions.append(prediction)
        return np.array(predictions)


In [7]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# Handmade KNN Classifier
knn_handmade = KNNClassifier(k=3)
knn_handmade.fit(X_train, y_train)
y_pred_handmade = knn_handmade.predict(X_test)

# Scikit-learn KNN Classifier
knn_sklearn = KNeighborsClassifier(n_neighbors=3)
knn_sklearn.fit(X_train, y_train)
y_pred_sklearn = knn_sklearn.predict(X_test)

# Evaluate accuracy
accuracy_handmade = accuracy_score(y_test, y_pred_handmade)
accuracy_sklearn = accuracy_score(y_test, y_pred_sklearn)

# Confusion matrix
conf_matrix_handmade = confusion_matrix(y_test, y_pred_handmade)
conf_matrix_sklearn = confusion_matrix(y_test, y_pred_sklearn)

print(f"Accuracy (Handmade): {accuracy_handmade}")
print(f"Accuracy (Scikit-learn): {accuracy_sklearn}")

print("\nConfusion Matrix (Handmade):")
print(conf_matrix_handmade)

print("\nConfusion Matrix (Scikit-learn):")
print(conf_matrix_sklearn)


Accuracy (Handmade): 0.7446808510638298
Accuracy (Scikit-learn): 0.7446808510638298

Confusion Matrix (Handmade):
[[629 232]
 [212 666]]

Confusion Matrix (Scikit-learn):
[[629 232]
 [212 666]]


In [8]:
passenger_id = test_data['PassengerId']

# Drop only the columns that exist in the test data
columns_to_drop = ['Name', 'Cabin']
test_data = test_data.drop(columns_to_drop, axis=1, errors='ignore')

# Get the feature names used during training
feature_names_train = X.columns.tolist()

# Keep only the features used during training in the test set
test_data = test_data[feature_names_train]

# Assuming knn_handmade is your trained handmade KNN model
predictions = knn_handmade.predict(scaler.transform(test_data))

# Map 0 and 1 to 'False' and 'True'
predictions_mapped = np.where(predictions == 1, 'True', 'False')

# Select the first 4277 rows
result_df = pd.DataFrame({'PassengerId': passenger_id[:4277], 'Transported': predictions_mapped[:4277]})

# Save the DataFrame to a new CSV file
result_df.to_csv('predicted_transportation12.csv', index=False)
print('smthing')

smthing


In [9]:
import pandas as pd

# Load the CSV file
df = pd.read_csv('predicted_transportation12.csv')

# Get the number of rows
num_rows = len(df)

# Print the result
print(f'The CSV file has {num_rows} rows.')


The CSV file has 4277 rows.
