## Experiment #1 - Decision Tree Classifier

In [693]:
%matplotlib inline
import numpy as np
import pandas as pd
import math
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler 
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (20.0, 10.0)

In [694]:
data = pd.read_csv(r'C:\Users\apaiu\GitHub Repositories\Tekwill-Machine-Learning-Course\Data Sources\titanic_kaggle_train.csv')

In [695]:
test_data = pd.read_csv(r'C:\Users\apaiu\GitHub Repositories\Tekwill-Machine-Learning-Course\Data Sources\titanic_kaggle_test.csv')

In [696]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,707,1,2,"Kelly, Mrs. Florence ""Fannie""",female,45.0,0,0,223596,13.5,,S
1,737,0,3,"Ford, Mrs. Edward (Margaret Ann Watson)",female,48.0,1,3,W./C. 6608,34.375,,S
2,370,1,1,"Aubart, Mme. Leontine Pauline",female,24.0,0,0,PC 17477,69.3,B35,C
3,133,0,3,"Robins, Mrs. Alexander A (Grace Charity Laury)",female,47.0,1,0,A/5. 3337,14.5,,S
4,415,1,3,"Sundman, Mr. Johan Julian",male,44.0,0,0,STON/O 2. 3101269,7.925,,S


In [697]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 623 entries, 0 to 622
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  623 non-null    int64  
 1   Survived     623 non-null    int64  
 2   Pclass       623 non-null    int64  
 3   Name         623 non-null    object 
 4   Sex          623 non-null    object 
 5   Age          501 non-null    float64
 6   SibSp        623 non-null    int64  
 7   Parch        623 non-null    int64  
 8   Ticket       623 non-null    object 
 9   Fare         623 non-null    float64
 10  Cabin        135 non-null    object 
 11  Embarked     622 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 58.5+ KB


In [698]:
#Data checks for Train Data
#Checking if there is any null values within the dataframe - THESE COLUMNS HAVE NULLS ['Age', 'Cabin', 'Embarked']
data.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            122
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          488
Embarked         1
dtype: int64

In [699]:
#Filling NULL values with a constant (Unknown)
data['Age'] = data['Age'].fillna(0)
data['Cabin'] = data['Cabin'].fillna('Unknown')
data['Embarked'] = data['Embarked'].fillna('Unknown')

In [700]:
#No null values present anymore after corrections
data.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

In [701]:
#Checking if there are any duplicated rows - NO DUPLICATES IDENTIFIED
data[data.duplicated()].sum()

PassengerId    0.0
Survived       0.0
Pclass         0.0
Name           0.0
Sex            0.0
Age            0.0
SibSp          0.0
Parch          0.0
Ticket         0.0
Fare           0.0
Cabin          0.0
Embarked       0.0
dtype: float64

In [702]:
#Transforming ['Sex'] categorical column into a numeric one
dictt = {"female": 0, "male": 1}
data['Sex'] = data['Sex'].map(dictt)

In [703]:
data['Sex'].value_counts()

1    402
0    221
Name: Sex, dtype: int64

In [704]:
data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,707,1,2,"Kelly, Mrs. Florence ""Fannie""",0,45.0,0,0,223596,13.5000,Unknown,S
1,737,0,3,"Ford, Mrs. Edward (Margaret Ann Watson)",0,48.0,1,3,W./C. 6608,34.3750,Unknown,S
2,370,1,1,"Aubart, Mme. Leontine Pauline",0,24.0,0,0,PC 17477,69.3000,B35,C
3,133,0,3,"Robins, Mrs. Alexander A (Grace Charity Laury)",0,47.0,1,0,A/5. 3337,14.5000,Unknown,S
4,415,1,3,"Sundman, Mr. Johan Julian",1,44.0,0,0,STON/O 2. 3101269,7.9250,Unknown,S
...,...,...,...,...,...,...,...,...,...,...,...,...
618,523,0,3,"Lahoud, Mr. Sarkis",1,0.0,0,0,2624,7.2250,Unknown,C
619,666,0,2,"Hickman, Mr. Lewis",1,32.0,2,0,S.O.C. 14879,73.5000,Unknown,S
620,272,1,3,"Tornquist, Mr. William Henry",1,25.0,0,0,LINE,0.0000,Unknown,S
621,739,0,3,"Ivanoff, Mr. Kanio",1,0.0,0,0,349201,7.8958,Unknown,S


In [705]:
#Transforming [Cabin] & [Embarked] feature into numeric ones:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(data['Cabin'])
# le.classes_
data['Cabin'] = le.transform(data['Cabin']) 

In [706]:
le.fit(data['Embarked'])
data['Embarked'] = le.transform(data['Embarked']) 

In [707]:
data['Embarked'].value_counts(), data['Cabin'].value_counts()

(2    448
 0    121
 1     53
 3      1
 Name: Embarked, dtype: int64,
 106    488
 37       4
 51       3
 100      3
 69       2
       ... 
 53       1
 24       1
 40       1
 52       1
 38       1
 Name: Cabin, Length: 107, dtype: int64)

In [708]:
#Collecting X and Y
y = data['Survived'].values
X = data.drop(['Survived', 'Name', 'Ticket'], axis=1).values #Eliminated [Name] and [Ticket] columns due to irrelevancy

In [709]:
X, y

(array([[707.    ,   2.    ,   0.    , ...,  13.5   , 106.    ,   2.    ],
        [737.    ,   3.    ,   0.    , ...,  34.375 , 106.    ,   2.    ],
        [370.    ,   1.    ,   0.    , ...,  69.3   ,  18.    ,   0.    ],
        ...,
        [272.    ,   3.    ,   1.    , ...,   0.    , 106.    ,   2.    ],
        [739.    ,   3.    ,   1.    , ...,   7.8958, 106.    ,   2.    ],
        [572.    ,   1.    ,   0.    , ...,  51.4792,  38.    ,   2.    ]]),
 array([1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
        1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1,
        0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
        0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1,
        1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1,
        1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
        1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
        0, 0, 1, 

In [710]:
#TO DO: Stardardizarea, normalizarea, data pre-processing.... 

In [711]:
# Experiment #1 - Decission Tree Classifier

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
model = DecisionTreeClassifier(max_depth=5, random_state=0)

#Training data from train.csv
X_train = X
y_train = y

model.fit(X_train, y_train)
y_train_pred = model.predict(X_train)
print("Accuracy in train set: ", accuracy_score(y_train_pred, y_train))

Accuracy in train set:  0.8619582664526485


In [712]:
X_test, y_test_pred

(array([[0.95936795, 0.        , 0.        , ..., 0.07690368, 0.69811321,
         0.66666667],
        [0.751693  , 0.        , 0.        , ..., 0.10149724, 0.44339623,
         0.66666667],
        [0.90180587, 0.        , 1.        , ..., 0.2342244 , 0.3490566 ,
         0.66666667],
        ...,
        [0.22911964, 1.        , 1.        , ..., 0.03093714, 1.        ,
         0.66666667],
        [0.77539503, 0.        , 1.        , ..., 0.11125659, 0.13207547,
         0.66666667],
        [0.26410835, 0.5       , 0.        , ..., 0.05123659, 1.        ,
         0.66666667]]),
 array([1., 1., 1., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 1., 1., 0., 1.,
        1., 1., 0., 0., 0., 1., 1., 0., 1., 1., 0., 1., 0., 0., 0., 0., 1.,
        0., 1., 0., 0., 1., 1., 0., 1., 0., 0., 1., 0., 0., 1., 0., 1., 0.,
        0., 0., 0., 0., 1., 0., 0., 0., 1., 1., 0., 1., 0., 0., 0., 0., 0.,
        1., 0., 0., 0., 1., 0., 0., 1., 1., 1., 0., 0., 0., 0., 1., 1., 0.,
        0., 0., 0., 1., 0., 0

In [517]:
# Experiment #2 ....
# Experiment #3 ....


## Working with TEST data set

In [713]:
#Data checks
#Checking if there is any null values within the dataframe - THESE COLUMNS HAVE NULLS ['Age', 'Cabin', 'Embarked']
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 268 entries, 0 to 267
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  268 non-null    int64  
 1   Pclass       268 non-null    int64  
 2   Name         268 non-null    object 
 3   Sex          268 non-null    object 
 4   Age          213 non-null    float64
 5   SibSp        268 non-null    int64  
 6   Parch        268 non-null    int64  
 7   Ticket       268 non-null    object 
 8   Fare         268 non-null    float64
 9   Cabin        69 non-null     object 
 10  Embarked     267 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 23.2+ KB


In [714]:
#Checking if there is any null values within the dataframe - THESE COLUMNS HAVE NULLS ['Age', 'Cabin', 'Embarked']
test_data.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             55
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          199
Embarked         1
dtype: int64

In [715]:
#Filling NULL values with a constant (Unknown)
test_data['Age'] = test_data['Age'].fillna(0)
test_data['Cabin'] = test_data['Cabin'].fillna('Unknown')
test_data['Embarked'] = test_data['Embarked'].fillna('Unknown')

In [716]:
#No null values present anymore after corrections
test_data.isnull().sum()

PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

In [717]:
#Checking if there are any duplicated rows - NO DUPLICATES IDENTIFIED
test_data[test_data.duplicated()].sum()

PassengerId    0.0
Pclass         0.0
Name           0.0
Sex            0.0
Age            0.0
SibSp          0.0
Parch          0.0
Ticket         0.0
Fare           0.0
Cabin          0.0
Embarked       0.0
dtype: float64

In [718]:
#Transforming ['Sex'] categorical column into a numeric one
dictt = {"female": 0, "male": 1}
test_data['Sex'] = test_data['Sex'].map(dictt)

In [719]:
test_data['Sex'].value_counts()

1    175
0     93
Name: Sex, dtype: int64

In [720]:
#Transforming [Cabin] & [Embarked] feature into numeric ones:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(test_data['Cabin'])
# le.classes_
test_data['Cabin'] = le.transform(test_data['Cabin']) 

In [721]:
le.fit(test_data['Embarked'])
test_data['Embarked'] = le.transform(test_data['Embarked']) 

In [722]:
test_data['Embarked'].value_counts(), test_data['Cabin'].value_counts()

(2    196
 0     47
 1     24
 3      1
 Name: Embarked, dtype: int64,
 63    199
 51      2
 62      2
 37      2
 42      2
      ... 
 17      1
 1       1
 43      1
 6       1
 55      1
 Name: Cabin, Length: 64, dtype: int64)

In [723]:
#Collecting X and Y

#Creating target column with Null value and assign it to y variable
test_data['Survived'] = np.NaN

y_test = test_data['Survived']
X_test = test_data.drop(['Survived', 'Name', 'Ticket'], axis=1).values #Eliminated [Name] and [Ticket] columns due to irrelevancy

In [529]:
X_test, y_test

(array([[626.    ,   1.    ,   1.    , ...,  32.3208,  47.    ,   2.    ],
        [208.    ,   3.    ,   1.    , ...,  18.7875,  63.    ,   0.    ],
        [700.    ,   3.    ,   1.    , ...,   7.65  ,  58.    ,   2.    ],
        ...,
        [ 92.    ,   3.    ,   1.    , ...,   7.8542,  63.    ,   2.    ],
        [559.    ,   1.    ,   0.    , ...,  79.65  ,  55.    ,   2.    ],
        [845.    ,   3.    ,   1.    , ...,   8.6625,  63.    ,   2.    ]]),
 0     NaN
 1     NaN
 2     NaN
 3     NaN
 4     NaN
        ..
 263   NaN
 264   NaN
 265   NaN
 266   NaN
 267   NaN
 Name: Survived, Length: 268, dtype: float64)

In [724]:
#Predicting Results on #Model from Experiment #1:

y_test_pred = model.predict(X_test)

In [725]:
X_test, y_test_pred

(array([[626.    ,   1.    ,   1.    , ...,  32.3208,  47.    ,   2.    ],
        [208.    ,   3.    ,   1.    , ...,  18.7875,  63.    ,   0.    ],
        [700.    ,   3.    ,   1.    , ...,   7.65  ,  58.    ,   2.    ],
        ...,
        [ 92.    ,   3.    ,   1.    , ...,   7.8542,  63.    ,   2.    ],
        [559.    ,   1.    ,   0.    , ...,  79.65  ,  55.    ,   2.    ],
        [845.    ,   3.    ,   1.    , ...,   8.6625,  63.    ,   2.    ]]),
 array([0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1,
        1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0,
        1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0,
        0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1,
        0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0,
        0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
        0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1,
        1, 1, 0, 

In [726]:
#Creating DataFrame as per expected solution for Kaggle

#Reshape predicted values
y_test_pred_resh = y_test_pred.reshape(y_test_pred.shape[0], -1)

In [727]:
#Merge X_test PassengerID & predicted value 
merged_info = np.concatenate((X_test[:,[0]], y_test_pred_resh), axis=1)

In [728]:
#Create DataFrame for merged arrays (merged_info)
final = pd.DataFrame(data = merged_info, columns = ['PassengerId', 'Survived'])

In [729]:
#Converting all columns from float to int
final = final.astype(int)

In [730]:
final

Unnamed: 0,PassengerId,Survived
0,626,0
1,208,0
2,700,0
3,622,1
4,76,0
...,...,...
263,235,0
264,563,0
265,92,0
266,559,1


In [731]:
#Exporting the dataframe to a csv file
final.to_csv('DecisionTreeClassifier_EXP_01.csv', index=False)

## **To do:**
- De vazut cum se mai poate de facut data pre-processing:

     - De facut dictionar cu 0, 1 pentru male/female, etc.
     - De vazut cum se poate de facut standardizarea
     - Normalizarea
     - Sklearn pentru data pre-processing features categorice
     
    
- De incercat diferiti algoritmi cu datele cum sunt dupa pre-processarea curenta
- De incercat diferiti algoritmi cu datele cum sunt dupa pre-processarea mai buna + standardizare + normalizare, etc.

## Experiment #2 - Data Normalization + Decision Tree Classifier

In [732]:
#Data Normalization
mm_scaler = MinMaxScaler()
X = mm_scaler.fit_transform(X)
y = mm_scaler.fit_transform(y.reshape(-1,1))

In [733]:
#Data after normalization
X, y

(array([[0.79345372, 0.5       , 0.        , ..., 0.02635025, 1.        ,
         0.66666667],
        [0.82731377, 1.        , 0.        , ..., 0.06709553, 1.        ,
         0.66666667],
        [0.41309255, 0.        , 0.        , ..., 0.13526459, 0.16981132,
         0.        ],
        ...,
        [0.30248307, 1.        , 1.        , ..., 0.        , 1.        ,
         0.66666667],
        [0.82957111, 1.        , 1.        , ..., 0.01541158, 1.        ,
         0.66666667],
        [0.64108352, 0.        , 0.        , ..., 0.10048071, 0.35849057,
         0.66666667]]),
 array([[1.],
        [0.],
        [1.],
        [0.],
        [1.],
        [0.],
        [1.],
        [1.],
        [0.],
        [1.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [1.],
   

In [736]:
# Experiment #2 - Decission Tree Classifier
#Changed Max_Depth = 10

model2 = DecisionTreeClassifier(max_depth=10, random_state=0)

#Training data from train.csv
X_train = X
y_train = y

model2.fit(X_train, y_train)
y_train_pred = model2.predict(X_train)
print("Accuracy in train set: ", accuracy_score(y_train_pred, y_train))

Accuracy in train set:  0.9582664526484751


In [737]:
#Predicting Results on #Model from Experiment #2:

y_test_pred = model2.predict(X_test)

In [738]:
y_test_pred_resh = y_test_pred.reshape(y_test_pred.shape[0], -1)
merged_info = np.concatenate((X_test[:,[0]], y_test_pred_resh), axis=1)
final = pd.DataFrame(data = merged_info, columns = ['PassengerId', 'Survived'])
final = final.astype(int)

In [739]:
#Exporting the dataframe to a csv file
final.to_csv('DecisionTreeClassifier_EXP_02.csv', index=False)

In [661]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 42)

Instantiaza modelul LinearRegression din modulul sklearn.linear_model. Cheama functia .fit() cu parametrii de intrare respectivi pentru a antrena modelul.

In [66]:
from sklearn.linear_model import LinearRegression

regr = LinearRegression()
regr.fit(X_train, y_train)

LinearRegression()

Verifica scorul prezicerilor tale. Incearca sa chemi functia *mean_squared_error* din modulul *sklearn.metrics* pentru a vedea eroarea dintre prezicerile tale si valoarea adevarata a targetului. Fa asta atat pentru setul de antrenare cat si cel de test.

In [70]:
from sklearn.metrics import mean_squared_error

train_score = regr.score(X_train, y_train)
print("The training score of model is: ", train_score)

test_score = regr.score(X_test, y_test)
print("The score of the model on test data is:", test_score )

The training score of model is:  0.347992619352986
The score of the model on test data is: 0.40318034127962166


In [74]:
y_pred_train = regr.predict(X_train)
mean_squared_error(y_train, y_pred_train)

0.01696764629255962

In [75]:
y_pred_test = regr.predict(X_test)
mean_squared_error(y_test, y_pred_test)

0.015601005758558212