# *Machine Learning with Python*

In [7]:
import numpy as np # Library for linear algebra and mathematics
import pandas as pd # Library for data manipulation

In [8]:
df = pd.read_csv('../Data Analysis/Data/train_cleaned.csv') # Read the data from the csv file
df.head() # Display the first 5 rows of the data

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Gender,Embarked_C,Embarked_Q,Embarked_S
0,0,3,22.0,1,0,1,0,0,1
1,1,1,38.0,1,0,0,1,0,0
2,1,3,26.0,0,0,0,0,0,1
3,1,1,35.0,1,0,0,0,0,1
4,0,3,35.0,0,0,1,0,0,1


In [9]:
X, y = df.drop('Survived', axis=1), df['Survived'] # Split the data into features and target

In [10]:
from sklearn.model_selection import train_test_split # Import the train_test_split function from the sklearn library

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) # Split the data into training and testing sets

In [12]:
#Normalize the data using standard scaler
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [13]:
#modelling 
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier

In [14]:
estimators = {
    'Logistic Regression': LogisticRegression(),
    'KNN': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(n_estimators=100),
    'SVM': SVC(),
    'Naive Bayes': GaussianNB(),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=150),
    'Ada Boost': AdaBoostClassifier(),
    'Bagging': BaggingClassifier(),
    'Extra Trees': ExtraTreesClassifier()
}

In [15]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report

In [16]:
from sklearn import metrics
import time

In [17]:
results = pd.DataFrame({'Model': [], 'MSE': [], 'MAE': [], " % error": [], 'Accuracy Score': [], 'Time': []})

for model_name, model in estimators.items():
    
    start_time = time.time()
    model.fit(X_train, y_train)
    total_time = time.time() - start_time
        
    pred = model.predict(X_test)
    
    results = pd.concat([results, pd.DataFrame({"Model":    [model_name],
                                                 "MSE": [metrics.mean_squared_error(y_test, pred)],
                                                 "MAE": [metrics.mean_absolute_error(y_test, pred)],
                                                 " % error": [metrics.mean_squared_error(y_test, pred)], #/ rang,
                                                 "Accuracy Score": [model.score(X_test,y_test)],
                                                 "Time":     [total_time]})], ignore_index=True)

results_ord = results.sort_values(by=['MSE'], ascending=True, ignore_index=True)
results_ord.index += 1 
results_ord.style.bar(subset=['MSE', 'MAE'], vmin=0, vmax=100, color='#5fba7d')


Unnamed: 0,Model,MSE,MAE,% error,Accuracy Score,Time
1,Gradient Boosting,0.162921,0.162921,0.162921,0.837079,0.140649
2,SVM,0.185393,0.185393,0.185393,0.814607,0.015945
3,Random Forest,0.191011,0.191011,0.191011,0.808989,0.167586
4,Ada Boost,0.191011,0.191011,0.191011,0.808989,0.078792
5,Extra Trees,0.196629,0.196629,0.196629,0.803371,0.131648
6,KNN,0.207865,0.207865,0.207865,0.792135,0.002029
7,Decision Tree,0.207865,0.207865,0.207865,0.792135,0.001984
8,Bagging,0.219101,0.219101,0.219101,0.780899,0.020944
9,Logistic Regression,0.224719,0.224719,0.224719,0.775281,0.010992
10,Naive Bayes,0.258427,0.258427,0.258427,0.741573,0.000998


In [18]:
#importing GradientBoostingClassifier()
from sklearn.ensemble import GradientBoostingClassifier

In [19]:
model = GradientBoostingClassifier(n_estimators=150)
model.fit(X_train, y_train)
preds_val = model.predict(X_test)
acc_score = accuracy_score(y_test, preds_val)
print(f'The accuracy score of the model is: {(acc_score*100):.2f}%.')


The accuracy score of the model is: 83.71%.


In [20]:
#saving the model
import pickle
pickle.dump(model, open('model.pkl','wb'))

In [21]:
#loading the model
model = pickle.load(open('model.pkl','rb'))

In [22]:
#testing the model
print(model.predict([[3,35.0,0,0,1,0,0,1]]))

[1]


In [24]:
#getting imput from the user
Pclass = int(input("Enter the Passenger class: "))
Age = int(input("Enter the Age: "))
SibSp = int(input("Enter number of Siblings: "))
Parch = int(input("Enter the number of Parents: "))
Gender = input("Enter your Gender:")
Embarked = input("Enter the Embarked: ")
if Gender == 'male':
    Gender = 1
else:
    Gender = 0

if Embarked == 'S':
    Embarked_C = 0
    Embarked_Q = 0
    Embarked_S = 1
elif Embarked == 'C':
    Embarked_C = 1
    Embarked_Q = 0
    Embarked_S = 0
else:
    Embarked_C = 0
    Embarked_Q = 1
    Embarked_S = 0

#predicting the result
pred = model.predict([[Pclass, Age, SibSp, Parch, Gender, Embarked_C, Embarked_Q, Embarked_S]])
if pred == 1:
    print('You are likely to survive')
else:
    print('You are likely to die')



You are likely to survive
