In [29]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.ensemble import StackingClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix

In [30]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [31]:
train_data = train_data.loc[:, ~train_data.columns.str.contains('^Unnamed')]
test_data = test_data.loc[:, ~test_data.columns.str.contains('^Unnamed')]

In [32]:
train_data.isnull().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [33]:
test_data.isnull().sum()

PassengerId       0
HomePlanet       87
CryoSleep        93
Cabin           100
Destination      92
Age              91
VIP              93
RoomService      82
FoodCourt       106
ShoppingMall     98
Spa             101
VRDeck           80
Name             94
dtype: int64

In [34]:

# Split 'PassengerId' column in the train set
train_data[['GroupNumber', 'FamilySize']] = train_data['PassengerId'].apply(lambda x: pd.Series(str(x).split("_")))

# Split 'Cabin' column in the train set
train_data[['Deck', 'Cabin_num', 'Side']] = train_data['Cabin'].apply(lambda x: pd.Series(str(x).split("/")))

# Split 'PassengerId' column in the test set
test_data[['GroupNumber', 'FamilySize']] = test_data['PassengerId'].apply(lambda x: pd.Series(str(x).split("_")))

# Split 'Cabin' column in the test set
test_data[['Deck', 'Cabin_num', 'Side']] = test_data['Cabin'].apply(lambda x: pd.Series(str(x).split("/")))

In [35]:
# Compute 'FamilySize' in the train set
train_data['FamilySize'] = train_data.groupby('GroupNumber')['GroupNumber'].transform('count')

# Compute 'FamilySize' in the test set
test_data['FamilySize'] = test_data.groupby('GroupNumber')['GroupNumber'].transform('count')


In [36]:
train_data['FamilyType'] = np.where(train_data['FamilySize'] == 1, 'Single', np.where(train_data['FamilySize'] <= 4, 'Small', 'Large'))
test_data['FamilyType'] = np.where(test_data['FamilySize'] == 1, 'Single', np.where(test_data['FamilySize'] <= 4, 'Small', 'Large'))

In [37]:
train_data = train_data.drop(['Name', 'PassengerId', 'Cabin'], axis = 1)
test_data = test_data.drop(['Name', 'PassengerId', 'Cabin'], axis = 1)

In [38]:
from sklearn.impute import SimpleImputer

# Define the columns to impute
col = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Cabin_num', 'CryoSleep', 'VIP']

# Create an instance of SimpleImputer with strategy='mean'
imputer = SimpleImputer(strategy='mean')

# Impute missing values in the train_data
train_data[col] = imputer.fit_transform(train_data[col])

# Impute missing values in the test_data
test_data[col] = imputer.transform(test_data[col])


In [39]:
# Replacing the missing categorical values for train data
train_data['Destination'] = train_data['Destination'].fillna(train_data['Destination'].value_counts().index[0])
train_data['HomePlanet'] = train_data['HomePlanet'].fillna(train_data['HomePlanet'].value_counts().index[0])
train_data['Deck'] = train_data['Deck'].fillna(train_data['Deck'].value_counts().index[0])
train_data['Side'] = train_data['Side'].fillna(train_data['Side'].value_counts().index[0])


In [40]:
# Replacing the missing categorical values for test data

test_data['Destination'] = test_data['Destination'].fillna(test_data['Destination'].value_counts().index[0])
test_data['HomePlanet'] = test_data['HomePlanet'].fillna(test_data['HomePlanet'].value_counts().index[0])
test_data['Deck'] = test_data['Deck'].fillna(test_data['Deck'].value_counts().index[0])
test_data['Side'] = test_data['Side'].fillna(test_data['Side'].value_counts().index[0])


In [41]:
#Converting the Dataframes into a numerical values for train and test data
train_data['FamilySize'] = pd.to_numeric(train_data['FamilySize'], downcast='integer')
train_data['GroupNumber'] = pd.to_numeric(train_data['GroupNumber'], downcast='integer')

test_data['FamilySize'] = pd.to_numeric(test_data['FamilySize'], downcast='integer')
test_data['GroupNumber'] = pd.to_numeric(test_data['GroupNumber'], downcast='integer')


In [42]:
labels = train_data.columns
for col in labels:
    if train_data[col].dtype == 'O':
        encoder = LabelEncoder()
        train_data[col] = encoder.fit_transform(train_data[col])
        
    elif train_data[col].dtype == 'bool':
        train_data[col] = train_data[col].astype('int')

In [43]:
labels1 = test_data.columns
for col in labels1:
    if test_data[col].dtype == 'O':
        encoder = LabelEncoder()
        test_data[col] = encoder.fit_transform(test_data[col])
        
    elif test_data[col].dtype == 'bool':
        test_data[col] = test_data[col].astype('int')

In [44]:
from sklearn.preprocessing import LabelEncoder

# Encode categorical columns in train_data
categorical_cols_train = train_data.select_dtypes(include=['object', 'bool']).columns
train_data[categorical_cols_train] = train_data[categorical_cols_train].apply(lambda x: LabelEncoder().fit_transform(x))

# Encode categorical columns in test_data
categorical_cols_test = test_data.select_dtypes(include=['object', 'bool']).columns
test_data[categorical_cols_test] = test_data[categorical_cols_test].apply(lambda x: LabelEncoder().fit_transform(x))

# Convert boolean columns to integers
bool_cols = train_data.select_dtypes(include='bool').columns
train_data[bool_cols] = train_data[bool_cols].astype(int)
test_data[bool_cols] = test_data[bool_cols].astype(int)


In [45]:
df = train_data.drop('Transported', axis = 1)
target = train_data[['Transported']]

In [46]:
from sklearn.preprocessing import RobustScaler

def robust_scale(X):
    scaler = RobustScaler()
    X_scaled = scaler.fit_transform(X)
    return X_scaled

norm_dataset = robust_scale(df)
norm_test = robust_scale(test_data)


In [47]:
def normalizeFeatures(X):
    mu=np.mean(X)
    sigma=np.std(X)
    X_norm=(X - mu)/sigma
    return X_norm

norm_dataset = normalizeFeatures(df)
norm_test = normalizeFeatures(test_data)


In a future version, DataFrame.mean(axis=None) will return a scalar mean over the entire DataFrame. To retain the old behavior, use 'frame.mean(axis=0)' or just 'frame.mean()'


In a future version, DataFrame.mean(axis=None) will return a scalar mean over the entire DataFrame. To retain the old behavior, use 'frame.mean(axis=0)' or just 'frame.mean()'



In [48]:
x_train, x_test, y_train, y_test = train_test_split(norm_dataset, target, test_size=0.2, random_state=10)

In [49]:
train_score = []
test_score = []

In [50]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
import plotly.graph_objs as go

# Scale data
scaler = StandardScaler()
X_train = scaler.fit_transform(x_train)
X_test = scaler.transform(x_test)

# Logistic Regression
lr_param_grid = {
    'C': [0.01, 0.1, 1.0, 10, 100]
}

lr_model = LogisticRegression(random_state=42)
lr_model.fit(X_train, y_train)
lr_grid_search = GridSearchCV(lr_model, lr_param_grid, cv=5, scoring='accuracy')
lr_grid_search.fit(X_train, y_train)
lr_y_pred = lr_grid_search.predict(X_test)
lr_accuracy_test = accuracy_score(y_test, lr_y_pred)
lr_accuracy_train = lr_grid_search.best_score_
lr_best_params = lr_grid_search.best_params_

# Neural Network
nn_param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50)],
    'activation': ['relu', 'tanh'],
    'alpha': [0.0001, 0.001, 0.01]
}

nn_model = MLPClassifier(random_state=42)
nn_model.fit(X_train, y_train)
nn_grid_search = GridSearchCV(nn_model, nn_param_grid, cv=5, scoring='accuracy')
nn_grid_search.fit(X_train, y_train)
nn_y_pred = nn_grid_search.predict(X_test)
nn_accuracy_test = accuracy_score(y_test, nn_y_pred)
nn_accuracy_train = nn_grid_search.best_score_
nn_best_params = nn_grid_search.best_params_

# Linear Discriminant Analysis
lda_model = LinearDiscriminantAnalysis()
lda_model.fit(X_train, y_train)
lda_y_pred = lda_model.predict(X_test)
lda_accuracy_test = accuracy_score(y_test, lda_y_pred)
lda_accuracy_train = lda_model.score(X_train, y_train)
lda_best_params = {}

# Results
results = {
    'Logistic Regression': {
        'accuracy_train': lr_accuracy_train,
        'accuracy_test': lr_accuracy_test,
        'best_params': lr_best_params
    },
    'Neural Network': {
        'accuracy_train': nn_accuracy_train,
        'accuracy_test': nn_accuracy_test,
        'best_params': nn_best_params
    },
    'Linear Discriminant Analysis': {
        'accuracy_train': lda_accuracy_train,
        'accuracy_test': lda_accuracy_test,
        'best_params': lda_best_params
    }
}

# Plot training accuracy
fig_train = go.Figure()
for algorithm, result in results.items():
    fig_train.add_trace(go.Bar(x=[algorithm], y=[result['accuracy_train']],
                              name='Training Accuracy'))

fig_train.update_layout(
    title='Parametric Models - Training Accuracy',
    xaxis_title='Algorithm',
    yaxis_title='Accuracy'
)

fig_train.show()

# Plot testing accuracy
fig_test = go.Figure()
for algorithm, result in results.items():
    fig_test.add_trace(go.Bar(x=[algorithm], y=[result['accuracy_test']],
                             name='Testing Accuracy'))

fig_test.update_layout(
    title='Parametric Models - Testing Accuracy',
    xaxis_title='Algorithm',
    yaxis_title='Accuracy'
)

fig_test.show()



A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expec

In [51]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import StackingClassifier

scaler = StandardScaler()
X_train = scaler.fit_transform(x_train)
X_test = scaler.transform(x_test)

# Random Forest
rf_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10]
}

rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
rf_grid_search = GridSearchCV(rf_model, rf_param_grid, cv=5, scoring='accuracy')
rf_grid_search.fit(X_train, y_train)
rf_y_pred = rf_grid_search.predict(X_test)
rf_accuracy_test = accuracy_score(y_test, rf_y_pred)
rf_accuracy_train = rf_grid_search.best_score_
rf_best_params = rf_grid_search.best_params_

# Gradient Boosting

gb_param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1.0],
    'max_depth': [3, 5, 7]
}

gb_model = GradientBoostingClassifier(random_state=42)
gb_model.fit(X_train, y_train)
gb_grid_search = GridSearchCV(gb_model, gb_param_grid, cv=5, scoring='accuracy')
gb_grid_search.fit(X_train, y_train)
gb_y_pred = gb_grid_search.predict(X_test)
gb_accuracy_test = accuracy_score(y_test, gb_y_pred)
gb_accuracy_train = gb_grid_search.best_score_
gb_best_params = gb_grid_search.best_params_


# AdaBoost
adaboost_param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1.0]
}

adaboost_model = AdaBoostClassifier(random_state=42)
adaboost_model.fit(X_train, y_train)
adaboost_grid_search = GridSearchCV(adaboost_model, adaboost_param_grid, cv=5, scoring='accuracy')
adaboost_grid_search.fit(X_train, y_train)
adaboost_y_pred = adaboost_grid_search.predict(X_test)
adaboost_accuracy_test = accuracy_score(y_test, adaboost_y_pred)
adaboost_accuracy_train = adaboost_grid_search.best_score_
adaboost_best_params = adaboost_grid_search.best_params_

# Results
results = {
    'Random Forest': {
        'accuracy_train': rf_accuracy_train,
        'accuracy_test': rf_accuracy_test,
        'best_params': rf_best_params
    },
    'Gradient Boosting': {
        'accuracy_train': gb_accuracy_train,
        'accuracy_test': gb_accuracy_test,
        'best_params': gb_best_params
    },
    'AdaBoost': {
        'accuracy_train': adaboost_accuracy_train,
        'accuracy_test': adaboost_accuracy_test,
        'best_params': adaboost_best_params
    }
}

# Plot training accuracy
fig_train = go.Figure()
for algorithm, result in results.items():
    fig_train.add_trace(go.Bar(x=[algorithm], y=[result['accuracy_train']],
                              name='Training Accuracy'))

fig_train.update_layout(
    title='Non-Parametric Models - Training Accuracy',
    xaxis_title='Algorithm',
    yaxis_title='Accuracy'
)

fig_train.show()

# Plot testing accuracy
fig_test = go.Figure()
for algorithm, result in results.items():
    fig_test.add_trace(go.Bar(x=[algorithm], y=[result['accuracy_test']],
                             name='Testing Accuracy'))

fig_test.update_layout(
    title='Non-Parametric Models - Testing Accuracy',
    xaxis_title='Algorithm',
    yaxis_title='Accuracy'
)

fig_test.show()




A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().


A column-vector y was passed when a 1d array was expected. Pl

In [52]:
estimators = [
('Logistic Regression', lr_grid_search.best_estimator_),
('Neural Network', nn_grid_search.best_estimator_),
('Linear Discriminant Analysis', lda_model),
('Random Forest', rf_grid_search.best_estimator_),
('Gradient Boosting', gb_grid_search.best_estimator_),
('AdaBoost', adaboost_grid_search.best_estimator_)
]

stacking_model = StackingClassifier(estimators=estimators, final_estimator=GradientBoostingClassifier())
stacking_model.fit(X_train, y_train)
stacking_y_pred = stacking_model.predict(X_test)
stacking_accuracy_test = accuracy_score(y_test, stacking_y_pred)
stacking_accuracy_train = stacking_model.score(X_train, y_train)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.


Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.


Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.


Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.


Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.


Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.



In [53]:
print("Training Accuracies:")
print("Logistic Regression:", lr_accuracy_train)
print("Neural Network:", nn_accuracy_train)
print("Linear Discriminant Analysis:", lda_accuracy_train)
print("Random Forest:", rf_accuracy_train)
print("Gradient Boosting:", gb_accuracy_train)
print("AdaBoost:", adaboost_accuracy_train)
print("Stacking Classifier:", stacking_accuracy_train)

print("Testing Accuracies:")
print("Logistic Regression:", lr_accuracy_test)
print("Neural Network:", nn_accuracy_test)
print("Linear Discriminant Analysis:", lda_accuracy_test)
print("Random Forest:", rf_accuracy_test)
print("Gradient Boosting:", gb_accuracy_test)
print("AdaBoost:", adaboost_accuracy_test)
print("Stacking Classifier:", stacking_accuracy_test)

Training Accuracies:
Logistic Regression: 0.7904683648315529
Neural Network: 0.8072308956450287
Linear Discriminant Analysis: 0.771076417419885
Random Forest: 0.7993426458504519
Gradient Boosting: 0.8095316351684471
AdaBoost: 0.7967132292522596
Stacking Classifier: 0.8700082169268694
Testing Accuracies:
Logistic Regression: 0.7841257668711656
Neural Network: 0.7944785276073619
Linear Discriminant Analysis: 0.7603527607361963
Random Forest: 0.7848926380368099
Gradient Boosting: 0.7879601226993865
AdaBoost: 0.7829754601226994
Stacking Classifier: 0.7910276073619632


In [54]:
import plotly.graph_objs as go

# Create a dictionary of the results
results = {
    'Model': ['Logistic Regression', 'Neural Network', 'Linear Discriminant Analysis', 'Random Forest', 'Gradient Boosting', 'AdaBoost', 'Stacking Classifier'],
    'Training Accuracy': [lr_accuracy_train, nn_accuracy_train, lda_accuracy_train, rf_accuracy_train, gb_accuracy_train, adaboost_accuracy_train, stacking_accuracy_train],
    'Testing Accuracy': [lr_accuracy_test, nn_accuracy_test, lda_accuracy_test, rf_accuracy_test, gb_accuracy_test, adaboost_accuracy_test, stacking_accuracy_test]
}

# Create a pandas DataFrame from the results dictionary
df_results = pd.DataFrame(results)

# Display the table
print(df_results)

# Plot the graph
fig = go.Figure()

# Add training accuracy trace
fig.add_trace(go.Bar(
    x=df_results['Model'],
    y=df_results['Training Accuracy'],
    name='Training Accuracy'
))

# Add testing accuracy trace
fig.add_trace(go.Bar(
    x=df_results['Model'],
    y=df_results['Testing Accuracy'],
    name='Testing Accuracy'
))

# Update the layout
fig.update_layout(
    title='Model Comparison',
    xaxis_title='Model',
    yaxis_title='Accuracy',
    barmode='group'
)

# Show the graph
fig.show()


                          Model  Training Accuracy  Testing Accuracy
0           Logistic Regression           0.790468          0.784126
1                Neural Network           0.807231          0.794479
2  Linear Discriminant Analysis           0.771076          0.760353
3                 Random Forest           0.799343          0.784893
4             Gradient Boosting           0.809532          0.787960
5                      AdaBoost           0.796713          0.782975
6           Stacking Classifier           0.870008          0.791028


In [55]:
test = pd.read_csv("test.csv")
final_sub = pd.DataFrame(test.PassengerId)
predict1 = gb_model.predict(norm_test)
n_predictions = (predict1 > 0.5).astype(bool)
transported = []
for res in predict1:
    if res == 1:
        transported.append(True)
    else:
        transported.append(False)
final_sub['Transported'] = transported
final_sub.to_csv('CS559_Group6_Submission.csv', index=False)



X has feature names, but GradientBoostingClassifier was fitted without feature names

