In [46]:
%matplotlib inline
import numpy as np
import pandas as pd
import math
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler 
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (20.0, 10.0)

In [316]:
#Importing TRAIN dataset:
!mkdir data && curl -o data/titanic_kaggle.csv -LJO https://github.com/AlinaPaiu/Tekwill-Machine-Learning-Course/raw/main/Data%20Sources/titanic_kaggle.csv

A subdirectory or file data already exists.


In [296]:
data = pd.read_csv('data/titanic_kaggle.csv')
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,707,1,2,"Kelly, Mrs. Florence ""Fannie""",female,45.0,0,0,223596,13.5,,S
1,737,0,3,"Ford, Mrs. Edward (Margaret Ann Watson)",female,48.0,1,3,W./C. 6608,34.375,,S
2,370,1,1,"Aubart, Mme. Leontine Pauline",female,24.0,0,0,PC 17477,69.3,B35,C
3,133,0,3,"Robins, Mrs. Alexander A (Grace Charity Laury)",female,47.0,1,0,A/5. 3337,14.5,,S
4,415,1,3,"Sundman, Mr. Johan Julian",male,44.0,0,0,STON/O 2. 3101269,7.925,,S


In [297]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 623 entries, 0 to 622
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  623 non-null    int64  
 1   Survived     623 non-null    int64  
 2   Pclass       623 non-null    int64  
 3   Name         623 non-null    object 
 4   Sex          623 non-null    object 
 5   Age          501 non-null    float64
 6   SibSp        623 non-null    int64  
 7   Parch        623 non-null    int64  
 8   Ticket       623 non-null    object 
 9   Fare         623 non-null    float64
 10  Cabin        135 non-null    object 
 11  Embarked     622 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 58.5+ KB


In [298]:
#Data checks
#Checking if there is any null values within the dataframe - THESE COLUMNS HAVE NULLS ['Age', 'Cabin', 'Embarked']
data.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            122
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          488
Embarked         1
dtype: int64

In [299]:
#Filling NULL values with a constant (Unknown)
data['Age'] = data['Age'].fillna(0)
data['Cabin'] = data['Cabin'].fillna('Unknown')
data['Embarked'] = data['Embarked'].fillna('Unknown')

In [300]:
#No null values present anymore after corrections
data.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

In [301]:
#Checking if there are any duplicated rows - NO DUPLICATES IDENTIFIED
data[data.duplicated()].sum()

PassengerId    0.0
Survived       0.0
Pclass         0.0
Name           0.0
Sex            0.0
Age            0.0
SibSp          0.0
Parch          0.0
Ticket         0.0
Fare           0.0
Cabin          0.0
Embarked       0.0
dtype: float64

In [302]:
#Transforming ['Sex'] categorical column into a numeric one
dictt = {"female": 0, "male": 1}
data['Sex'] = data['Sex'].map(dictt)

In [303]:
data['Sex'].value_counts()

1    402
0    221
Name: Sex, dtype: int64

In [304]:
data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,707,1,2,"Kelly, Mrs. Florence ""Fannie""",0,45.0,0,0,223596,13.5000,Unknown,S
1,737,0,3,"Ford, Mrs. Edward (Margaret Ann Watson)",0,48.0,1,3,W./C. 6608,34.3750,Unknown,S
2,370,1,1,"Aubart, Mme. Leontine Pauline",0,24.0,0,0,PC 17477,69.3000,B35,C
3,133,0,3,"Robins, Mrs. Alexander A (Grace Charity Laury)",0,47.0,1,0,A/5. 3337,14.5000,Unknown,S
4,415,1,3,"Sundman, Mr. Johan Julian",1,44.0,0,0,STON/O 2. 3101269,7.9250,Unknown,S
...,...,...,...,...,...,...,...,...,...,...,...,...
618,523,0,3,"Lahoud, Mr. Sarkis",1,0.0,0,0,2624,7.2250,Unknown,C
619,666,0,2,"Hickman, Mr. Lewis",1,32.0,2,0,S.O.C. 14879,73.5000,Unknown,S
620,272,1,3,"Tornquist, Mr. William Henry",1,25.0,0,0,LINE,0.0000,Unknown,S
621,739,0,3,"Ivanoff, Mr. Kanio",1,0.0,0,0,349201,7.8958,Unknown,S


In [305]:
#Transforming [Cabin] & [Embarked] feature into numeric ones:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(data['Cabin'])
# le.classes_
data['Cabin'] = le.transform(data['Cabin']) 

In [306]:
le.fit(data['Embarked'])
data['Embarked'] = le.transform(data['Embarked']) 

In [307]:
data['Embarked'].value_counts(), data['Cabin'].value_counts()

(2    448
 0    121
 1     53
 3      1
 Name: Embarked, dtype: int64,
 106    488
 37       4
 51       3
 100      3
 69       2
       ... 
 53       1
 24       1
 40       1
 52       1
 38       1
 Name: Cabin, Length: 107, dtype: int64)

In [308]:
#Collecting X and Y
y = data['Survived'].values
X = data.drop(['Survived', 'Name', 'Ticket'], axis=1).values #Eliminated [Name] and [Ticket] columns due to irrelevancy

In [295]:
X, y

(array([[707.    ,   2.    ,   0.    , ...,  13.5   , 106.    ,   2.    ],
        [737.    ,   3.    ,   0.    , ...,  34.375 , 106.    ,   2.    ],
        [370.    ,   1.    ,   0.    , ...,  69.3   ,  18.    ,   0.    ],
        ...,
        [272.    ,   3.    ,   1.    , ...,   0.    , 106.    ,   2.    ],
        [739.    ,   3.    ,   1.    , ...,   7.8958, 106.    ,   2.    ],
        [572.    ,   1.    ,   0.    , ...,  51.4792,  38.    ,   2.    ]]),
 array([1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
        1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1,
        0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
        0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1,
        1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1,
        1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
        1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
        0, 0, 1, 

In [137]:
#TO DO: Stardardizarea, normalizarea, data pre-processing.... 

In [315]:
# Experiment #1 - Decission Tree Classifier

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
model = DecisionTreeClassifier(max_depth=5, random_state=0)

#Training data from train.csv
X_train = X
y_train = y

model.fit(X_train, y_train)
y_train_pred = model.predict(X_train)
print("Accuracy in train set: ", accuracy_score(y_train_pred, y_train))

Accuracy in train set:  0.8619582664526485


In [None]:
#Importing TEST dataset:


**To do:**
- De vazut cum se mai poate de facut data pre-processing:

     - De facut dictionar cu 0, 1 pentru male/female, etc.
     - De vazut cum se poate de facut standardizarea
     - Normalizarea
     - Sklearn pentru data pre-processing features categorice
     
    
- De incercat diferiti algoritmi cu datele cum sunt dupa pre-processarea curenta
- De incercat diferiti algoritmi cu datele cum sunt dupa pre-processarea mai buna + standardizare + normalizare, etc.

In [None]:
-----------------------------------------------------------------------------------------------------------------------

In [55]:
# Collecting X and Y
y = data['quality'].values
X = data.drop(['quality'], axis=1).values

In [56]:
X, y

(array([[ 7.4  ,  0.7  ,  0.   , ...,  3.51 ,  0.56 ,  9.4  ],
        [ 7.8  ,  0.88 ,  0.   , ...,  3.2  ,  0.68 ,  9.8  ],
        [ 7.8  ,  0.76 ,  0.04 , ...,  3.26 ,  0.65 ,  9.8  ],
        ...,
        [ 6.3  ,  0.51 ,  0.13 , ...,  3.42 ,  0.75 , 11.   ],
        [ 5.9  ,  0.645,  0.12 , ...,  3.57 ,  0.71 , 10.2  ],
        [ 6.   ,  0.31 ,  0.47 , ...,  3.39 ,  0.66 , 11.   ]]),
 array([5, 5, 5, ..., 6, 5, 6]))

In urmatoarele celule faceti o analiza a setului de date. Vezi daca exista valori NaN, si inlocuieste-le cu media pe coloana. Incearca sa Standardizezi (sau sa Normalizezi) datele.

In [57]:
#Checking for null values (NaN) - no replacement with mean() is required
data.isnull().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

In [59]:
#Data Normalization
mm_scaler = MinMaxScaler()
X = mm_scaler.fit_transform(X)
y = mm_scaler.fit_transform(y.reshape(-1,1))

In [60]:
#Data after normalization
X, y

(array([[0.24778761, 0.39726027, 0.        , ..., 0.60629921, 0.13772455,
         0.15384615],
        [0.28318584, 0.52054795, 0.        , ..., 0.36220472, 0.20958084,
         0.21538462],
        [0.28318584, 0.43835616, 0.04      , ..., 0.40944882, 0.19161677,
         0.21538462],
        ...,
        [0.15044248, 0.26712329, 0.13      , ..., 0.53543307, 0.25149701,
         0.4       ],
        [0.11504425, 0.35958904, 0.12      , ..., 0.65354331, 0.22754491,
         0.27692308],
        [0.12389381, 0.13013699, 0.47      , ..., 0.51181102, 0.19760479,
         0.4       ]]), array([[0.4],
        [0.4],
        [0.4],
        ...,
        [0.6],
        [0.4],
        [0.6]]))

Imparte setul de date in set de date in: date de antrenare, date pentru test, cu porportia de 0.2 pentru test.

In [61]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 42)

Instantiaza modelul LinearRegression din modulul sklearn.linear_model. Cheama functia .fit() cu parametrii de intrare respectivi pentru a antrena modelul.

In [66]:
from sklearn.linear_model import LinearRegression

regr = LinearRegression()
regr.fit(X_train, y_train)

LinearRegression()

Verifica scorul prezicerilor tale. Incearca sa chemi functia *mean_squared_error* din modulul *sklearn.metrics* pentru a vedea eroarea dintre prezicerile tale si valoarea adevarata a targetului. Fa asta atat pentru setul de antrenare cat si cel de test.

In [70]:
from sklearn.metrics import mean_squared_error

train_score = regr.score(X_train, y_train)
print("The training score of model is: ", train_score)

test_score = regr.score(X_test, y_test)
print("The score of the model on test data is:", test_score )

The training score of model is:  0.347992619352986
The score of the model on test data is: 0.40318034127962166


In [74]:
y_pred_train = regr.predict(X_train)
mean_squared_error(y_train, y_pred_train)

0.01696764629255962

In [75]:
y_pred_test = regr.predict(X_test)
mean_squared_error(y_test, y_pred_test)

0.015601005758558212