In [1]:
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import numpy as np
from sklearn.linear_model import LogisticRegression

In [2]:
%matplotlib inline

In [3]:
spaceship_df = pd.read_csv('data/train.csv')

In [4]:
spaceship_df.head(8)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
5,0005_01,Earth,False,F/0/P,PSO J318.5-22,44.0,False,0.0,483.0,0.0,291.0,0.0,Sandie Hinetthews,True
6,0006_01,Earth,False,F/2/S,TRAPPIST-1e,26.0,False,42.0,1539.0,3.0,0.0,0.0,Billex Jacostaffey,True
7,0006_02,Earth,True,G/0/S,TRAPPIST-1e,28.0,False,0.0,0.0,0.0,0.0,,Candra Jacostaffey,True


In [5]:
#sns.stripplot(data=spaceship_df, x="Age", y="Transported")

In [6]:
spaceship_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [7]:
spaceship_df = spaceship_df.dropna()

In [8]:
#sns.stripplot(data=spaceship_df, x="HomePlanet", y="Transported")


In [9]:
#Lets drop the column that are not meaningful
spaceship_df.drop(['PassengerId', 'Name'], axis='columns', inplace=True)

In [10]:
spaceship_df.head()

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
0,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False
1,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True
2,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False
3,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False
4,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True


In [11]:
#Lets transform the categorical features to the datatype categorical
spaceship_df['HomePlanet'] = spaceship_df['HomePlanet'].astype('category')
spaceship_df['Cabin'] = spaceship_df['Cabin'].astype('category')
spaceship_df['Destination'] = spaceship_df['Destination'].astype('category')
spaceship_df['CryoSleep'] = spaceship_df['CryoSleep'].astype('category')
spaceship_df['Transported'] = spaceship_df['Transported'].astype('category')
spaceship_df['VIP'] = spaceship_df['VIP'].astype('category')

In [12]:
spaceship_df.dtypes

HomePlanet      category
CryoSleep       category
Cabin           category
Destination     category
Age              float64
VIP             category
RoomService      float64
FoodCourt        float64
ShoppingMall     float64
Spa              float64
VRDeck           float64
Transported     category
dtype: object

In [13]:
#Lets select all the categorical columns and assign a numerical value
cat_columns = spaceship_df.select_dtypes(['category']).columns
spaceship_df[cat_columns] = spaceship_df[cat_columns].apply(lambda x: x.cat.codes)
spaceship_df

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
0,1,0,137,2,39.0,0,0.0,0.0,0.0,0.0,0.0,0
1,0,0,1823,2,24.0,0,109.0,9.0,25.0,549.0,44.0,1
2,1,0,1,2,58.0,1,43.0,3576.0,0.0,6715.0,49.0,0
3,1,0,1,2,33.0,0,0.0,1283.0,371.0,3329.0,193.0,0
4,0,0,1825,2,16.0,0,303.0,70.0,151.0,565.0,2.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
8688,1,0,134,0,41.0,1,0.0,6819.0,0.0,1643.0,74.0,0
8689,0,1,4293,1,18.0,0,0.0,0.0,0.0,0.0,0.0,0
8690,0,0,4298,2,26.0,0,0.0,0.0,1872.0,1.0,0.0,1
8691,1,0,1778,0,32.0,0,0.0,1049.0,0.0,353.0,3235.0,0


Lets train and evaluate the model

In [14]:
classifier = LogisticRegression()
X = spaceship_df.drop(['Transported'], axis='columns')
y = spaceship_df['Transported']
model = classifier.fit(X, y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [15]:
test_data = pd.read_csv('data/test.csv')
#Lets drop the columns from test_data
test_data.drop(['PassengerId', 'Name'], axis='columns', inplace=True)
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    4190 non-null   object 
 1   CryoSleep     4184 non-null   object 
 2   Cabin         4177 non-null   object 
 3   Destination   4185 non-null   object 
 4   Age           4186 non-null   float64
 5   VIP           4184 non-null   object 
 6   RoomService   4195 non-null   float64
 7   FoodCourt     4171 non-null   float64
 8   ShoppingMall  4179 non-null   float64
 9   Spa           4176 non-null   float64
 10  VRDeck        4197 non-null   float64
dtypes: float64(6), object(5)
memory usage: 367.7+ KB


In [16]:
#Lets change the categorical column to numeric
test_data['HomePlanet'] = test_data['HomePlanet'].astype('category')
test_data['Cabin'] = test_data['Cabin'].astype('category')
test_data['Destination'] = test_data['Destination'].astype('category')
test_data['CryoSleep'] = test_data['CryoSleep'].astype('category')
test_data['VIP'] = test_data['VIP'].astype('category')

cat_columns = test_data.select_dtypes(['category']).columns
test_data[cat_columns] = test_data[cat_columns].apply(lambda x: x.cat.codes)

In [17]:
test_data.head()

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
0,0,1,2784,2,27.0,0,0.0,0.0,0.0,0.0,0.0
1,0,0,1867,2,19.0,0,0.0,9.0,0.0,2823.0,0.0
2,1,1,257,0,31.0,0,0.0,0.0,0.0,0.0,0.0
3,1,0,259,2,38.0,0,0.0,6652.0,0.0,181.0,585.0
4,0,0,1940,2,20.0,0,10.0,0.0,635.0,0.0,0.0


In [18]:
#Since we need to not have any NaN values lets replace them with the average (only for the numerical columns)
mean = test_data[['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']].mean()
mean

Age              28.658146
RoomService     219.266269
FoodCourt       439.484296
ShoppingMall    177.295525
Spa             303.052443
VRDeck          310.710031
dtype: float64

In [19]:
test_data.fillna(mean, inplace=True)

In [20]:
#Lets check for NaN for the dataframe related with the numerical columns ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
test_data[['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']].isnull().values.any()

False

In [21]:
''' Model prediction '''
#0 == false
#1 == true
prediction = model.predict(test_data)
prediction = np.where(prediction == 0, 'False', 'True')

In [22]:
prediction

array(['True', 'False', 'True', ..., 'True', 'True', 'True'], dtype='<U5')

In [23]:
transported_df = pd.DataFrame(data = prediction, columns = ['Transported'])

In [24]:
test_data_passenger_id = pd.read_csv('data/test.csv')['PassengerId']

In [25]:
result = transported_df.merge(test_data_passenger_id,left_index=True, right_index=True)

In [26]:
result = result.reindex(columns=['PassengerId', 'Transported'])

In [27]:
result

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,False
...,...,...
4272,9266_02,True
4273,9269_01,False
4274,9271_01,True
4275,9273_01,True


In [28]:
result.to_csv('result.csv', sep=',', encoding='utf-8', index=False)