In [1]:
#importing the required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.linear_model import LinearRegression

%matplotlib inline

In [2]:
#Read the training and test splits from CSV files in the working directory
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))
#Preview the first five rows of the training data
train.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
#Count the missing values in each column of the training data
train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [4]:
#Impute missing ages with the mean age, rounded to one decimal place
train['Age'] = train['Age'].fillna(train['Age'].mean().round(1))
#Confirm that no missing ages remain
train['Age'].isna().sum()

0

In [5]:
#Count the missing values in each column of the test data
test.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [6]:
#Impute missing test-set ages with the test set's own mean age (one decimal place)
test['Age'] = test['Age'].fillna(test['Age'].mean().round(1))
#Remaining missing ages in the test and train sets, in that order
test['Age'].isna().sum(), train['Age'].isna().sum()

(0, 0)

In [7]:
#Tag every row with the split it came from, then stack both splits into one frame
for frame, split_name in ((train, "train"), (test, "test")):
    frame['source'] = split_name
data = pd.concat([train, test], ignore_index=True)
print(data.shape)

(1309, 13)


In [8]:
#Preview the first five rows of the combined frame (head() defaults to five rows)
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,source
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,train
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,train
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,train
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,train
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,train


In [9]:
#transforming categorical data to numerical data using the LabelEncoder class
#Importing and initializing the LabelEncoder
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [10]:
#Distinct values of the two categorical columns that are encoded below
tuple(data[column].unique() for column in ('Sex', 'Embarked'))

(array(['male', 'female'], dtype=object),
 array(['S', 'C', 'Q', nan], dtype=object))

In [11]:
#Frequency of each embarkation port, followed by the most common port
embarked = data['Embarked']
embarked.value_counts(), embarked.mode()

(S    914
 C    270
 Q    123
 Name: Embarked, dtype: int64, 0    S
 dtype: object)

In [12]:
#Fill the missing embarkation ports with the most frequent port. The fill value is
#read from the column instead of being hard-coded, and fillna() is used because NaN
#is not a text pattern, so replace(..., regex=True) had nothing to match as a regex.
data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode().iloc[0])

In [13]:
#Label-encode the Sex and Embarked columns in turn; the shared encoder is refit for
#each column, so after this cell it holds the Embarked classes
for column in ('Sex', 'Embarked'):
    data[column] = le.fit_transform(data[column])

In [14]:
#Show the first three rows to check the encoded Sex and Embarked columns
data.iloc[:3]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,source
0,1,0.0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,2,train
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,0,train
2,3,1.0,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,,2,train


In [15]:
#Inspect the column dtypes after label encoding Sex and Embarked
data.dtypes

PassengerId      int64
Survived       float64
Pclass           int64
Name            object
Sex              int64
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked         int64
source          object
dtype: object

In [16]:
#Frequency tables for the discrete columns, in the order Parch, Pclass, SibSp, Embarked
tuple(data[column].value_counts() for column in ('Parch', 'Pclass', 'SibSp', 'Embarked'))

(0    1002
 1     170
 2     113
 3       8
 5       6
 4       6
 6       2
 9       2
 Name: Parch, dtype: int64, 3    709
 1    323
 2    277
 Name: Pclass, dtype: int64, 0    891
 1    319
 2     42
 4     22
 3     20
 8      9
 5      6
 Name: SibSp, dtype: int64, 2    916
 0    270
 1    123
 Name: Embarked, dtype: int64)

In [17]:
#One-hot encode Parch and SibSp: each distinct value becomes its own indicator column
one_hot_columns = ["Parch", "SibSp"]
data = pd.get_dummies(data, columns=one_hot_columns)

In [18]:
#Inspect the column dtypes after one-hot encoding Parch and SibSp
data.dtypes

PassengerId      int64
Survived       float64
Pclass           int64
Name            object
Sex              int64
Age            float64
Ticket          object
Fare           float64
Cabin           object
Embarked         int64
source          object
Parch_0          uint8
Parch_1          uint8
Parch_2          uint8
Parch_3          uint8
Parch_4          uint8
Parch_5          uint8
Parch_6          uint8
Parch_9          uint8
SibSp_0          uint8
SibSp_1          uint8
SibSp_2          uint8
SibSp_3          uint8
SibSp_4          uint8
SibSp_5          uint8
SibSp_8          uint8
dtype: object

In [19]:
#divide the data set back into its train and test parts using the 'source' tag
#drop() is called without inplace=True so each result is a new frame; nothing is
#modified through a filtered slice of `data`, which is what used to trigger the
#pandas warning, so no global warning suppression is needed here
is_train = data['source'] == 'train'
is_test = data['source'] == 'test'
#drop unnecessary columns (the test part also loses Name; the train part keeps it)
train = data.loc[is_train].drop(columns=["source"])
test = data.loc[is_test].drop(columns=["Name", "source"])
#Export modified version of the file
train.to_csv("train_mod.csv", index=False)
test.to_csv("test_mod.csv", index=False)

In [20]:
#Load the exported train and test files back from disk
train_2, test_2 = (pd.read_csv(csv_name) for csv_name in ("train_mod.csv", "test_mod.csv"))

In [21]:
#creating the independent variable X and dependent Y variable for the train set and dropping the irrelevant variables
x_train = train_2.drop(['Name', 'Ticket', 'Fare', 'Cabin', 'Survived'], axis=1)
y_train = train_2.Survived

In [23]:
#dropping the irrelevant variables for x_test
x_test = test_2.drop(['Ticket', 'Fare', 'Cabin', 'Survived'], axis=1)

In [24]:
#Hold out 30% of the training rows for validation, with a fixed seed for repeatability
from sklearn.model_selection import train_test_split

xtrain, xtest, ytrain, ytest = train_test_split(
    x_train,
    y_train,
    test_size=0.3,
    random_state=42,
)

In [25]:
#Create a LinearRegression model and fit it on the training portion of the split
#(fit() returns the estimator itself, so construction and fitting can be chained)
lrg = LinearRegression().fit(xtrain, ytrain)
#Display the fitted estimator
lrg

LinearRegression()

In [26]:
#Print the fitted coefficients first and the intercept second
for fitted_value in (lrg.coef_, lrg.intercept_):
    print(fitted_value)

[ 3.00053641e-05 -1.46350956e-01 -4.95596516e-01 -4.07030302e-03
 -4.81491519e-02  1.29095732e-01  2.39451695e-01  1.56947213e-01
  4.44971763e-01 -3.17190283e-01 -2.46291724e-01 -4.06984396e-01
 -8.32667268e-17  1.60414705e-01  1.64405089e-01  1.12578411e-01
 -1.49804532e-01  3.23764626e-03 -1.26287409e-01 -1.64543910e-01]
0.9325798672369241


In [27]:
#predict on the held-out part of the training data (xtest from train_test_split)
#and display the raw regression outputs
preds = lrg.predict(xtest)
preds

array([ 0.30220307,  0.22451654,  0.13497107,  0.9406581 ,  0.73124379,
        0.88234473,  0.6230319 ,  0.08820326,  0.67603456,  0.9340675 ,
        0.34137444,  0.03296302,  0.38057046,  0.17268314,  0.20131451,
        1.05029674,  0.33554552,  0.63362379,  0.25421015,  0.29587088,
        0.10230692,  0.35889663,  0.61918479,  0.13023022,  0.08642751,
       -0.10732377,  0.41826088,  0.23153779, -0.10558601,  0.57247187,
        0.14015157,  0.60145323,  0.48307909,  0.5729324 ,  0.12402826,
        0.20016645,  0.37059617,  0.61997135,  0.97892478,  0.09245858,
        0.20781156,  0.09143678,  0.09539911,  0.14213801,  0.72112171,
        0.44849465,  0.11584722,  0.10805753,  0.09978647,  0.39172028,
        0.838678  ,  0.90676604, -0.4031332 ,  0.49500737, -0.01684642,
        0.95161021,  0.20937426,  0.99158177,  0.69699062,  0.65890277,
        0.11793845,  0.90394451,  0.73592428,  0.36802743,  0.12431482,
        0.62427696,  0.2542076 ,  0.0857961 ,  0.34858497,  0.88

In [28]:
#Root mean squared error of the hold-out predictions
import math
mse = mean_squared_error(ytest, preds)
print(math.sqrt(mse))

0.3736364597342136


In [29]:
#predict on the test-set features (x_test) and display the raw regression outputs
y_Survived_preds = lrg.predict(x_test)
y_Survived_preds

array([ 0.1256311 ,  0.52622006,  0.16010873,  0.10809923,  0.73842362,
        0.16107318,  0.63972401,  0.37298686,  0.73677681,  0.08486479,
        0.09487727,  0.32370543,  0.91693931,  0.11220972,  0.81931205,
        0.86290637,  0.27042699,  0.22923943,  0.60813621,  0.62720869,
        0.38763144,  0.29226075,  0.88353577,  0.63247734,  1.22771613,
        0.01913277,  1.12409357,  0.22343403,  0.34456704,  0.14390938,
        0.16563375,  0.21966496,  0.61198595,  0.62300578,  0.48980909,
        0.23995529,  0.59125393,  0.62913775,  0.11729003,  0.09574743,
        0.26702006,  0.38850935,  0.0522852 ,  0.73903601,  0.82835281,
        0.11750007,  0.42512423,  0.14413662,  0.85972621,  0.59629475,
        0.41841269,  0.35218875,  0.84252876,  0.61145987,  0.33884677,
        0.18018272,  0.07712709,  0.11786013,  0.10030791,  0.95774354,
        0.15051257,  0.23583899,  0.14650228,  0.67399674,  0.53592921,
        0.73847504,  0.69036797,  0.32131543,  0.48276859,  0.32

In [34]:
#Assemble the submission table: one row per test passenger with its predicted label
predictions = pd.DataFrame(
    {'PassengerId': test_2['PassengerId'], 'Survived': y_Survived_preds},
    columns=['PassengerId', 'Survived'],
)
#The regression output is not confined to [0, 1], so clip it before rounding;
#without the clip a score of 1.5 or more would become 2 and a score below -0.5
#would become -1, neither of which is a valid Survived label
predictions['Survived'] = predictions['Survived'].clip(0, 1).round(0).astype(int)

In [35]:
#Display the submission table (PassengerId with the predicted Survived label)
predictions

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [36]:
#Write the submission table to disk without the index column
predictions.to_csv(path_or_buf="predictions.csv", index=False)