In [45]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import StandardScaler,MinMaxScaler,OrdinalEncoder,LabelEncoder,OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,BaggingClassifier
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score,ConfusionMatrixDisplay
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from scipy.stats.mstats import winsorize
from scipy.stats import trim_mean
from autogluon.tabular import TabularDataset, TabularPredictor
from sklearn.model_selection import GridSearchCV

In [2]:
data = pd.read_csv('train.csv')

In [3]:
data.drop(['Name'],axis=1,inplace=True)

### Handling the Null values


In [4]:
numerical_col = data.select_dtypes(exclude=['object','bool']).columns.tolist()
numerical_col

['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

In [5]:
imuter = SimpleImputer(strategy='mean')
num_without_nulls = pd.DataFrame(imuter.fit_transform(data[numerical_col]),columns=numerical_col)
num_without_nulls.isnull().sum()

Age             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
dtype: int64

In [6]:
data[numerical_col] = num_without_nulls
data.isnull().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age               0
VIP             203
RoomService       0
FoodCourt         0
ShoppingMall      0
Spa               0
VRDeck            0
Transported       0
dtype: int64

In [7]:
categorical_col = data.select_dtypes(include=['object','bool']).columns.tolist()
categorical_col

['PassengerId',
 'HomePlanet',
 'CryoSleep',
 'Cabin',
 'Destination',
 'VIP',
 'Transported']

In [8]:
imuter = SimpleImputer(strategy='most_frequent')
cat_without_nulls = pd.DataFrame(imuter.fit_transform(data[categorical_col]),columns=categorical_col)
cat_without_nulls.isnull().sum()


PassengerId    0
HomePlanet     0
CryoSleep      0
Cabin          0
Destination    0
VIP            0
Transported    0
dtype: int64

In [9]:
data[categorical_col] = cat_without_nulls
data.isnull().sum()


PassengerId     0
HomePlanet      0
CryoSleep       0
Cabin           0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
Transported     0
dtype: int64

## Extracting the group from the PassengerID¶


In [10]:
def Group(data):    
    group = []
    for id in data.PassengerId:
        group.append(int(id.split('_')[0]))
    return group

In [11]:
group = Group(data)

In [12]:
data.PassengerId = group
data.rename(columns={'PassengerId':'Group'},inplace=True)
data.Group

0          1
1          2
2          3
3          3
4          4
        ... 
8688    9276
8689    9278
8690    9279
8691    9280
8692    9280
Name: Group, Length: 8693, dtype: int64

# Splitting the Cabines into 3 parts (deck,num,side)¶


In [13]:
def Cabines(data):
    deck = []
    num = []
    side = []
    for cabin in data.Cabin:
        deck.append(cabin.split('/')[0])
        num.append(int(cabin.split('/')[1]))
        side.append(cabin.split('/')[-1])
    return deck,num,side

In [14]:
deck,num,side = Cabines(data)

In [15]:
cabin_df = pd.DataFrame({'Deck':deck,'Num':num,'Side':side})
cabin_df

Unnamed: 0,Deck,Num,Side
0,B,0,P
1,F,0,S
2,A,0,S
3,A,0,S
4,F,1,S
...,...,...,...
8688,A,98,P
8689,G,1499,S
8690,G,1500,S
8691,E,608,S


In [16]:
data.drop('Cabin',axis=1,inplace=True)
data = pd.concat([data,cabin_df],axis=1)
data.head()

Unnamed: 0,Group,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Deck,Num,Side
0,1,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False,B,0,P
1,2,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True,F,0,S
2,3,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,A,0,S
3,3,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False,A,0,S
4,4,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True,F,1,S


# Defining the final version of numerical,categorical features¶


In [18]:
categorical_col.extend(['Deck','Side'])
categorical_col.remove('Cabin')
categorical_col.remove('PassengerId')
numerical_col.extend(['Num','Group'])
print(f'Categorical columns : {categorical_col}, Numerical columns: {numerical_col}')

Categorical columns : ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Transported', 'Deck', 'Side'], Numerical columns: ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Num', 'Group']


## Normalizing and encoding

In [19]:
data[numerical_col] = StandardScaler().fit_transform(data[numerical_col])
data.describe()

Unnamed: 0,Group,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Num
count,8693.0,8693.0,8693.0,8693.0,8693.0,8693.0,8693.0,8693.0
mean,-2.615595e-17,-1.76144e-16,5.98726e-17,7.356361000000001e-17,7.724179000000001e-17,-5.803351e-17,-6.436816000000001e-17,-1.177018e-16
std,1.000058,1.000058,1.000058,1.000058,1.000058,1.000058,1.000058,1.000058
min,-1.734409,-2.010564,-0.3405899,-0.287314,-0.290817,-0.2766634,-0.2690226,-1.191744
25%,-0.8665285,-0.6156918,-0.3405899,-0.287314,-0.290817,-0.2766634,-0.2690226,-0.8500758
50%,-0.001269106,-0.1274865,-0.3405899,-0.287314,-0.290817,-0.2766634,-0.2690226,-0.3069617
75%,0.8422746,0.5699497,-0.2223546,-0.2133024,-0.2154885,-0.197525,-0.2063679,0.7496421
max,1.739733,3.499182,21.37681,18.41192,39.03403,19.64845,21.02742,2.548831


In [20]:
for col in categorical_col:
    data[col] = LabelEncoder().fit_transform(data[col])

In [21]:
data.select_dtypes(include=['object','bool']).sum()


Series([], dtype: float64)

In [22]:
X_train = data.drop('Transported',axis=1)
y_train = data.Transported

In [23]:
test = pd.read_csv('test.csv')
test.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,4186.0,4195.0,4171.0,4179.0,4176.0,4197.0
mean,28.658146,219.266269,439.484296,177.295525,303.052443,310.710031
std,14.179072,607.011289,1527.663045,560.821123,1117.186015,1246.994742
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,26.0,0.0,0.0,0.0,0.0,0.0
75%,37.0,53.0,78.0,33.0,50.0,36.0
max,79.0,11567.0,25273.0,8292.0,19844.0,22272.0


In [24]:
ID  = test.PassengerId


### Repeat the same steps with the test data

In [25]:
test.drop(['Name'],axis=1,inplace=True)

numerical_col = test.select_dtypes(exclude=['object','bool']).columns.tolist()

imuter = SimpleImputer(strategy='mean')
num_without_nulls = pd.DataFrame(imuter.fit_transform(test[numerical_col]),columns=numerical_col)
test[numerical_col] = num_without_nulls

categorical_col = test.select_dtypes(include=['object','bool']).columns.tolist()

imuter = SimpleImputer(strategy='most_frequent')
cat_without_nulls = pd.DataFrame(imuter.fit_transform(test[categorical_col]),columns=categorical_col)
test[categorical_col] = cat_without_nulls

group = Group(test)
test.PassengerId = group
test.rename(columns={'PassengerId':'Group'},inplace=True)

deck,num,side = Cabines(test)
cabin_df = pd.DataFrame({'Deck':deck,'Num':num,'Side':side})
test.drop('Cabin',axis=1,inplace=True)
test = pd.concat([test,cabin_df],axis=1)

categorical_col.extend(['Deck','Side'])
categorical_col.remove('Cabin')
categorical_col.remove('PassengerId')
numerical_col.extend(['Num','Group'])

test[numerical_col] = StandardScaler().fit_transform(test[numerical_col])

for col in categorical_col:
    test[col] = LabelEncoder().fit_transform(test[col])

In [26]:
X_test = test

In [46]:

# Define the parameter grid to search
param_grid = {
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5],
    'min_samples_split': [2, 4],
    'min_samples_leaf': [1, 2],
    'max_features': [None, 'sqrt', 'log2']
}

In [47]:
# Initialize the classifier
gb_classifier = GradientBoostingClassifier()

# Initialize the GridSearchCV object
grid_search = GridSearchCV(estimator=gb_classifier, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)

grid_search.fit(X_train, y_train)

# Get the best parameters and the best score
best_parameters = grid_search.best_params_
best_score = grid_search.best_score_

print("Best parameters found: ", best_parameters)
print("Best score found: ", best_score)

Fitting 5 folds for each of 324 candidates, totalling 1620 fits
Best parameters found:  {'learning_rate': 0.01, 'max_depth': 3, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}
Best score found:  0.758430602088022


In [48]:
# Best hyperparameters from grid search
best_params = {
    'learning_rate': 0.01,
    'max_depth': 3,
    'max_features': 'log2',
    'min_samples_leaf': 1,
    'min_samples_split': 2,
    'n_estimators': 300
}

In [49]:
# Initialize the classifier with the best parameters
gb_classifier_best = GradientBoostingClassifier(**best_params)

In [50]:

gb_classifier_best.fit(X_train,y_train)
y_pred = gb_classifier_best.predict(X_test)

In [51]:
y_pred.tolist()

[1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,


In [52]:
y_predediction = y_pred.astype('bool')

In [53]:
Submission = pd.DataFrame({'PassengerId':ID,'Transported':y_predediction})
Submission.to_csv('Submissionv2.csv',index=False)
Submission

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,False
...,...,...
4272,9266_02,True
4273,9269_01,False
4274,9271_01,True
4275,9273_01,True


In [40]:

# Define the path where to save the models
save_path = 'autogluon_models'


# Initialize the TabularPredictor with the target column name and evaluation metric
predictor = TabularPredictor(label='Transported', eval_metric='accuracy')

# Fit the predictor on your data, only including the specified models
predictor.fit(data, hyperparameters='default')

No path specified. Models will be saved in: "AutogluonModels\ag-20240314_152121"
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets.
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='best_quality'   : Maximize accuracy. Default time_limit=3600.
	presets='high_quality'   : Strong accuracy with fast inference speed. Default time_limit=3600.
	presets='good_quality'   : Good accuracy with very fast inference speed. Default time_limit=3600.
	presets='medium_quality' : Fast training time, ideal for initial prototyping.
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels\ag-20240314_152121"
AutoGluon Version:  1.0.0
Python Version:     3.10.9
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.19045
CPU Count:          16
Memory Avail:       16.42 GB / 31.95 GB (51.4%)
Disk Space Avail:   68.

<autogluon.tabular.predictor.predictor.TabularPredictor at 0x212b668c970>

In [41]:
# Get the leaderboard of models, sorted by their performance
leaderboard = predictor.leaderboard()

In [42]:
leaderboard

Unnamed: 0,model,score_val,eval_metric,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,CatBoost,0.824138,accuracy,0.002001,1.816094,0.002001,1.816094,1,True,7
1,WeightedEnsemble_L2,0.824138,accuracy,0.003003,2.342572,0.001002,0.526478,2,True,14
2,LightGBM,0.818391,accuracy,0.006005,0.749681,0.006005,0.749681,1,True,4
3,LightGBMXT,0.812644,accuracy,0.005005,0.479435,0.005005,0.479435,1,True,3
4,XGBoost,0.812644,accuracy,0.005005,0.645586,0.005005,0.645586,1,True,11
5,LightGBMLarge,0.810345,accuracy,0.008007,1.448089,0.008007,1.448089,1,True,13
6,NeuralNetFastAI,0.809195,accuracy,0.013012,5.083018,0.013012,5.083018,1,True,10
7,NeuralNetTorch,0.801149,accuracy,0.010009,10.476646,0.010009,10.476646,1,True,12
8,RandomForestEntr,0.791954,accuracy,0.047043,0.57252,0.047043,0.57252,1,True,6
9,ExtraTreesGini,0.781609,accuracy,0.048044,0.400174,0.048044,0.400174,1,True,8


In [43]:
predictions = predictor.predict(test)

In [44]:
submission = pd.DataFrame({'PassengerId':ID,'Transported':predictions})
submission['Transported'] = predictions.astype(bool)
submission.to_csv('Submissionv3.csv',index=False)
submission

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True
...,...,...
4272,9266_02,True
4273,9269_01,False
4274,9271_01,True
4275,9273_01,True
