<a href="https://colab.research.google.com/github/Abdelhak-mekaoui/Spaceship-Titanic/blob/main/Copy_of_logistic_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Spaceship Titanic

## Importing the libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [None]:
# Read the three competition CSVs from the working directory, in a fixed order.
train_data, test_data, sample_submission = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)


In [None]:
# Training features: drop the first column (PassengerId) and the last two
# (Name, Transported), leaving the 11 columns HomePlanet .. VRDeck.
X = train_data.iloc[:, 1:-2].to_numpy()
# Target: the last training column (Transported).
y = train_data.iloc[:, -1].to_numpy()
# Test features: drop the first and last columns.
# NOTE(review): presumably PassengerId and Name, since the test file has no
# target column -- confirm against test.csv.
X_test = test_data.iloc[:, 1:-1].to_numpy()

# Dealing with missing values

In [None]:
# Count missing values per column of the training frame.
train_data.isna().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [None]:
# Show dtypes and non-null counts per column of the training frame.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [None]:
from sklearn.impute import SimpleImputer

# Replace every NaN with the most frequent value of its column. The fill values
# are learned from the training features only and then reused on the test set.
imputer = SimpleImputer(strategy='most_frequent', missing_values=np.nan)
X = imputer.fit_transform(X)
X_test = imputer.transform(X_test)

In [None]:
# Shape (rows, columns) of the imputed training feature matrix.
print(X.shape)

(8693, 11)


# OneHotEncoding & feature scaling

In [None]:
# Positional indices into X (columns HomePlanet .. VRDeck of the training frame).
categorical_columns = [
    0,  # HomePlanet
    1,  # CryoSleep
    2,  # Cabin
    3,  # Destination
    5,  # VIP
]
numerical_columns = [
    4,   # Age
    6,   # RoomService
    7,   # FoodCourt
    8,   # ShoppingMall
    9,   # Spa
    10,  # VRDeck
]

In [None]:
from sklearn.preprocessing import OneHotEncoder,StandardScaler,LabelEncoder
from sklearn.compose import ColumnTransformer

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and later releases removed the old name; try the new spelling first and fall
# back so the cell runs on either version.
try:
    one_hot = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    one_hot = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Encoding and scaling the input variables:
# one-hot encode the categorical columns and standardise the numerical ones.
ct = ColumnTransformer(transformers=[('encoder', one_hot, categorical_columns),
                                     ('scaler', StandardScaler(), numerical_columns)],
                       remainder='passthrough')

# Fit on the training features only. Fitting on train+test would leak test-set
# statistics (scaler mean/variance, category vocabulary) into preprocessing.
# Categories that appear only in the test set are encoded as all zeros thanks
# to handle_unknown='ignore'.
ct.fit(X)
X = np.array(ct.transform(X))
X_test = np.array(ct.transform(X_test))

# Encoding the dependent variable as integer class labels.
le = LabelEncoder()
y = le.fit_transform(y)


In [None]:
# Shape (rows, columns) of the training matrix after encoding and scaling.
print(X.shape)

(8693, 9841)


In [None]:
df1 = pd.DataFrame(X)
df2 = pd.DataFrame(X_test)

# Remove duplicate columns.
# `df.columns.duplicated()` only compares column *labels*, which are a unique
# RangeIndex here, so it never drops anything. Compare column *contents*
# instead by looking for duplicated rows of the transposed training frame.
# This can be slow on a very wide matrix.
duplicate_mask = df1.T.duplicated().to_numpy()

# Apply the mask derived from the training data to both frames so the train
# and test matrices keep identical, aligned columns.
df1 = df1.loc[:, ~duplicate_mask]
df2 = df2.loc[:, ~duplicate_mask]

# Convert pandas dataframes back to numpy arrays
X = df1.to_numpy()
X_test = df2.to_numpy()

In [None]:
# Shape (rows, columns) of the training matrix after the duplicate-column step.
print(X.shape)

(8693, 9841)


# Splitting the data

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows as a validation ("cv") set; the fixed seed
# keeps the split reproducible.
X_train, X_cv, y_train, y_cv = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

# Models


**Logistic Regression**

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings and a fixed seed.
# fit() returns the estimator itself, so chaining keeps the same object.
classifier1 = LogisticRegression(random_state=0).fit(X_train, y_train)
classifier1

LogisticRegression(random_state=0)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Predict on the data the model was fitted on and on the held-out split.
y_pred_train = classifier1.predict(X_train)
y_pred_cv = classifier1.predict(X_cv)

# A bare expression in the middle of a cell is discarded, so the training
# accuracy has to be printed explicitly to be visible.
print("Results on training set")
print(confusion_matrix(y_train, y_pred_train))
print("Accuracy:", accuracy_score(y_train, y_pred_train))
print("Results on cross validation set")
print(confusion_matrix(y_cv, y_pred_cv))
# Last expression: validation accuracy, shown as the cell's output.
accuracy_score(y_cv, y_pred_cv)

[[2901  551]
 [ 413 3089]]
[[661 202]
 [184 692]]


0.7780333525014376

**Random forest**

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest: 20 entropy-criterion trees, fixed seed.
# NOTE(review): the saved output of this cell shows n_estimators=80, so the
# stored outputs were produced by a different setting -- re-run to refresh.
classifier2 = RandomForestClassifier(
    n_estimators=20,
    criterion='entropy',
    random_state=0,
).fit(X_train, y_train)
classifier2

RandomForestClassifier(criterion='entropy', n_estimators=80, random_state=0)

In [None]:
# Predict on the data the model was fitted on and on the held-out split.
y_pred_train2 = classifier2.predict(X_train)
y_pred_cv2 = classifier2.predict(X_cv)

# A bare expression in the middle of a cell is discarded, so the training
# accuracy has to be printed explicitly to be visible.
print("Results on training set")
print(confusion_matrix(y_train, y_pred_train2))
print("Accuracy:", accuracy_score(y_train, y_pred_train2))
print("Results on cross validation set")
print(confusion_matrix(y_cv, y_pred_cv2))
# Last expression: validation accuracy, shown as the cell's output.
accuracy_score(y_cv, y_pred_cv2)

[[3449    3]
 [   3 3499]]
[[684 179]
 [209 667]]


0.7768832662449684

**Naive Bayes** 

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings.
# fit() returns the estimator itself, so chaining keeps the same object.
classifier3 = GaussianNB().fit(X_train, y_train)
classifier3

GaussianNB()

In [None]:
# Predict on the data the model was fitted on and on the held-out split.
y_pred_train3 = classifier3.predict(X_train)
y_pred_cv3 = classifier3.predict(X_cv)

# A bare expression in the middle of a cell is discarded, so the training
# accuracy has to be printed explicitly to be visible.
print("Results on training set")
print(confusion_matrix(y_train, y_pred_train3))
print("Accuracy:", accuracy_score(y_train, y_pred_train3))
print("Results on cross validation set")
print(confusion_matrix(y_cv, y_pred_cv3))
# Last expression: validation accuracy, shown as the cell's output.
accuracy_score(y_cv, y_pred_cv3)

Results on training set
[[2782  670]
 [   1 3501]]
Results on cross validation set
[[ 62 801]
 [ 84 792]]


0.49108683151236343

**K-Nearest Neighbors**

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest neighbours with k=10; Minkowski metric with p=2 (Euclidean distance).
classifier4 = KNeighborsClassifier(
    n_neighbors=10,
    metric='minkowski',
    p=2,
).fit(X_train, y_train)
classifier4

KNeighborsClassifier(n_neighbors=10)

In [None]:
# Predict on the data the model was fitted on and on the held-out split.
y_pred_train4 = classifier4.predict(X_train)
y_pred_cv4 = classifier4.predict(X_cv)

# A bare expression in the middle of a cell is discarded, so the training
# accuracy has to be printed explicitly to be visible.
print("Results on training set")
print(confusion_matrix(y_train, y_pred_train4))
print("Accuracy:", accuracy_score(y_train, y_pred_train4))
print("Results on cross validation set")
print(confusion_matrix(y_cv, y_pred_cv4))
# Last expression: validation accuracy, shown as the cell's output.
accuracy_score(y_cv, y_pred_cv4)

Results on training set
[[2888  564]
 [ 755 2747]]
Results on cross validation set
[[678 185]
 [208 668]]


0.7740080506037953

**Kernel SVM**

In [None]:
from sklearn.svm import SVC

# Support vector classifier with an RBF kernel and a fixed seed.
# fit() returns the estimator itself, so chaining keeps the same object.
classifier5 = SVC(kernel='rbf', random_state=0).fit(X_train, y_train)
classifier5

SVC(random_state=0)

In [None]:
# Predict on the data the model was fitted on and on the held-out split.
y_pred_train5 = classifier5.predict(X_train)
y_pred_cv5 = classifier5.predict(X_cv)

# A bare expression in the middle of a cell is discarded, so the training
# accuracy has to be printed explicitly to be visible.
print("Results on training set")
print(confusion_matrix(y_train, y_pred_train5))
print("Accuracy:", accuracy_score(y_train, y_pred_train5))
print("Results on cross validation set")
print(confusion_matrix(y_cv, y_pred_cv5))
# Last expression: validation accuracy, shown as the cell's output.
accuracy_score(y_cv, y_pred_cv5)

Results on training set
[[2807  645]
 [ 592 2910]]
Results on cross validation set
[[670 193]
 [198 678]]


0.7751581368602645

**Support Vector Machine (linear kernel)**

In [None]:
from sklearn.svm import SVC

# Support vector classifier with a linear kernel and a fixed seed.
# fit() returns the estimator itself, so chaining keeps the same object.
classifier6 = SVC(kernel='linear', random_state=0).fit(X_train, y_train)
classifier6

SVC(kernel='linear', random_state=0)

In [None]:
# Predict on the data the model was fitted on and on the held-out split.
y_pred_train6 = classifier6.predict(X_train)
y_pred_cv6 = classifier6.predict(X_cv)

# A bare expression in the middle of a cell is discarded, so the training
# accuracy has to be printed explicitly to be visible.
print("Results on training set")
print(confusion_matrix(y_train, y_pred_train6))
print("Accuracy:", accuracy_score(y_train, y_pred_train6))
print("Results on cross validation set")
print(confusion_matrix(y_cv, y_pred_cv6))
# Last expression: validation accuracy, shown as the cell's output.
accuracy_score(y_cv, y_pred_cv6)

Results on training set
[[3320  132]
 [ 184 3318]]
Results on cross validation set
[[659 204]
 [198 678]]


0.7688326624496837

**Decision Tree**

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Single decision tree split on entropy, with a fixed seed.
# fit() returns the estimator itself, so chaining keeps the same object.
classifier7 = DecisionTreeClassifier(
    criterion='entropy',
    random_state=0,
).fit(X_train, y_train)
classifier7

DecisionTreeClassifier(criterion='entropy', random_state=0)

In [None]:
# Predict on the data the model was fitted on and on the held-out split.
y_pred_train7 = classifier7.predict(X_train)
y_pred_cv7 = classifier7.predict(X_cv)

# A bare expression in the middle of a cell is discarded, so the training
# accuracy has to be printed explicitly to be visible.
print("Results on training set")
print(confusion_matrix(y_train, y_pred_train7))
print("Accuracy:", accuracy_score(y_train, y_pred_train7))
print("Results on cross validation set")
print(confusion_matrix(y_cv, y_pred_cv7))
# Last expression: validation accuracy, shown as the cell's output.
accuracy_score(y_cv, y_pred_cv7)

Results on training set
[[3452    0]
 [   5 3497]]
Results on cross validation set
[[605 258]
 [177 699]]


0.7498562392179413