<a href="https://colab.research.google.com/github/27priyanshu/Machine-Learning-Models/blob/main/Titanic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.model_selection import train_test_split

**Data Handling**

In [2]:
data=pd.read_csv("titanic.csv")


In [3]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
#Using drop function to exclude collums I won't need. The parameter inplace true will change the table
data.drop(["PassengerId", "Name","SibSp","Parch","Ticket","Fare","Cabin","Embarked"], axis="columns",inplace=True)
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age
0,0,3,male,22.0
1,1,1,female,38.0
2,1,3,female,26.0
3,1,1,female,35.0
4,0,3,male,35.0


In [5]:
# To define X I can simply drop the collumn I don't need, which is the Y
# Here I am not using the parameter inplace, thus it is automaticaly set as "False" and doesn't change the table
X = data.drop(["Survived"],axis="columns")
# For the variable Y I can simply select the column Survived
Y = data["Survived"] #Another way to declare: Y = data.Survived
Y


0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

In [6]:
#The next step: create a pattern
#Pattern: Male = 0 / Female = 1
#This method map will create synonims in a dictionary
X.Sex = X.Sex.map({"male":0,"female":1})
X.head()


Unnamed: 0,Pclass,Sex,Age
0,3,0,22.0
1,1,1,38.0
2,3,1,26.0
3,1,1,35.0
4,3,0,35.0


In [9]:
#Some filles with the ages are empty. Checkin out the first 10 lines
X.Age[0:10]


0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
5     NaN
6    54.0
7     2.0
8    27.0
9    14.0
Name: Age, dtype: float64

In [10]:
# To fix this, we will use method fillna(). Inside the brackets I'll put the information to replace, which will be the mean
X.Age = X.Age.fillna(X.Age.mean())
X.Age[0:10]
#Index 5 was empty and is now showing the mean (29.699118)

0    22.000000
1    38.000000
2    26.000000
3    35.000000
4    35.000000
5    29.699118
6    54.000000
7     2.000000
8    27.000000
9    14.000000
Name: Age, dtype: float64

In [11]:
#Now want data to training (80%) and testing (20%)
#Sklearn has a method to automaticaly make this division"train_test_split"
#Parameters: X, Y, percentage. If I put one, the method will automaticaly calculate the other
train_test_split(X,Y,train_size = 0.8) # It is automaticaly setting 0.2 for testing
# The table will show X_training [712 rows x 3 columns], X_test [179 rows x 3 columns], Y_training[Length: 712, Y_test[Le

[     Pclass  Sex        Age
 829       1    1  62.000000
 597       3    0  49.000000
 828       3    0  29.699118
 541       3    1   9.000000
 680       3    1  29.699118
 ..      ...  ...        ...
 585       1    1  18.000000
 487       1    0  58.000000
 577       1    1  39.000000
 92        1    0  46.000000
 569       3    0  32.000000
 
 [712 rows x 3 columns],
      Pclass  Sex        Age
 382       3    0  32.000000
 723       2    0  50.000000
 334       1    1  29.699118
 757       2    0  18.000000
 537       1    1  30.000000
 ..      ...  ...        ...
 528       3    0  39.000000
 25        3    1  38.000000
 606       3    0  30.000000
 508       3    0  28.000000
 399       2    1  28.000000
 
 [179 rows x 3 columns],
 829    1
 597    0
 828    1
 541    0
 680    0
       ..
 585    1
 487    0
 577    1
 92     0
 569    1
 Name: Survived, Length: 712, dtype: int64,
 382    0
 723    0
 334    1
 757    0
 537    1
       ..
 528    0
 25     1
 606    0
 508  

In [12]:
# Now I create 4 variables to place each parameter
X_train,X_test,Y_train,Y_real=train_test_split(X,Y,train_size = 0.8)
#to see the lenght:
len(X_train)


712

In [13]:
len(X_test)

179

In [14]:
#Note that X_test is not in order. It presents it ramdomly to prevent model from becoming biased
X_test

Unnamed: 0,Pclass,Sex,Age
410,3,0,29.699118
766,1,0,29.699118
155,1,0,51.000000
186,3,1,29.699118
522,3,0,29.699118
...,...,...,...
457,1,1,29.699118
598,3,0,29.699118
616,3,0,34.000000
744,3,0,31.000000


## **Data Analysis**

Linear Regression
