In [8]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.metrics import (
accuracy_score, precision_score, recall_score, f1_score,
confusion_matrix, mean_absolute_error, mean_squared_error, r2_score
)

In [10]:
df = pd.read_csv('UCI_Credit_Card.csv')
df.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,1,20000.0,2,2,1,24,2,2,-1,-1,...,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,2,120000.0,2,2,2,26,-1,2,0,0,...,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1
2,3,90000.0,2,2,2,34,0,0,0,0,...,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,4,50000.0,2,2,1,37,0,0,0,0,...,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
4,5,50000.0,1,2,1,57,-1,0,-1,0,...,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0


In [11]:
df.info()
df.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 25 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   ID                          30000 non-null  int64  
 1   LIMIT_BAL                   30000 non-null  float64
 2   SEX                         30000 non-null  int64  
 3   EDUCATION                   30000 non-null  int64  
 4   MARRIAGE                    30000 non-null  int64  
 5   AGE                         30000 non-null  int64  
 6   PAY_0                       30000 non-null  int64  
 7   PAY_2                       30000 non-null  int64  
 8   PAY_3                       30000 non-null  int64  
 9   PAY_4                       30000 non-null  int64  
 10  PAY_5                       30000 non-null  int64  
 11  PAY_6                       30000 non-null  int64  
 12  BILL_AMT1                   30000 non-null  float64
 13  BILL_AMT2                   300

ID                            0
LIMIT_BAL                     0
SEX                           0
EDUCATION                     0
MARRIAGE                      0
AGE                           0
PAY_0                         0
PAY_2                         0
PAY_3                         0
PAY_4                         0
PAY_5                         0
PAY_6                         0
BILL_AMT1                     0
BILL_AMT2                     0
BILL_AMT3                     0
BILL_AMT4                     0
BILL_AMT5                     0
BILL_AMT6                     0
PAY_AMT1                      0
PAY_AMT2                      0
PAY_AMT3                      0
PAY_AMT4                      0
PAY_AMT5                      0
PAY_AMT6                      0
default.payment.next.month    0
dtype: int64

In [12]:
X = df.drop('default.payment.next.month', axis=1)
y = df['default.payment.next.month']

In [13]:
X_train, X_test, y_train, y_test = train_test_split(
X, y,
test_size=0.3,
random_state=42,
stratify=y
)

In [14]:
clf = DecisionTreeClassifier(
random_state=42
)

clf.fit(X_train, y_train)

In [15]:
y_pred = clf.predict(X_test)

In [16]:
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.7241111111111111

In [17]:
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

precision, recall, f1

(0.3856877323420074, 0.41687594173782017, 0.40067583876418056)

In [24]:
confusion_matrix(y_test, y_pred)

array([[5687, 1322],
       [1161,  830]])

In [32]:
df_air = pd.read_csv('AirQuality.csv', sep=';', decimal=',')

In [33]:
df_air = df_air.dropna(axis=1, how='all')

In [34]:
df_air.replace(-200, np.nan, inplace=True)
df_air.dropna(inplace=True)

In [35]:
df_air.columns

Index(['Date', 'Time', 'CO(GT)', 'PT08.S1(CO)', 'NMHC(GT)', 'C6H6(GT)',
       'PT08.S2(NMHC)', 'NOx(GT)', 'PT08.S3(NOx)', 'NO2(GT)', 'PT08.S4(NO2)',
       'PT08.S5(O3)', 'T', 'RH', 'AH'],
      dtype='object')

In [36]:
X = df_air.drop(['CO(GT)', 'Date', 'Time'], axis=1)
y = df_air['CO(GT)']

In [37]:
X_train, X_test, y_train, y_test = train_test_split(
X, y,
test_size=0.3,
random_state=42
)

In [38]:
reg = DecisionTreeRegressor(
random_state=42
)

reg.fit(X_train, y_train)

In [39]:
y_pred = reg.predict(X_test)

In [40]:
mae = mean_absolute_error(y_test, y_pred)
mae

0.24618473895582332

In [41]:
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

mse, rmse

(0.12983935742971886, 0.3603322875204481)

In [42]:
r2 = r2_score(y_test, y_pred)
r2

0.9321201735932984

In [50]:
df = pd.read_csv('UCI_Credit_Card.csv')
df.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,1,20000.0,2,2,1,24,2,2,-1,-1,...,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,2,120000.0,2,2,2,26,-1,2,0,0,...,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1
2,3,90000.0,2,2,2,34,0,0,0,0,...,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,4,50000.0,2,2,1,37,0,0,0,0,...,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
4,5,50000.0,1,2,1,57,-1,0,-1,0,...,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0


In [51]:
X = df.drop(['ID', 'default.payment.next.month'], axis=1)
y = df['default.payment.next.month']

In [52]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
X, y,
test_size=0.3,
random_state=42,
stratify=y
)

In [53]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

param_grid = {
'max_depth': [3, 5, 7, 10, None],
'min_samples_leaf': [1, 5, 10, 20],
'min_samples_split': [2, 10, 20]
}

clf = DecisionTreeClassifier(random_state=42)

grid_search = GridSearchCV(
clf,
param_grid,
cv=5,
scoring='f1',
n_jobs=-1
)

grid_search.fit(X_train, y_train)

In [54]:
grid_search.best_params_

{'max_depth': 3, 'min_samples_leaf': 20, 'min_samples_split': 2}

In [55]:
best_clf = grid_search.best_estimator_
best_clf.fit(X_train, y_train)

In [56]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

y_pred = best_clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

accuracy, precision, recall, f1

(0.8175555555555556,
 0.6623255813953488,
 0.35760924158714213,
 0.4644487932159165)

In [61]:
df_air = pd.read_csv('AirQuality.csv', sep=';', decimal=',')

In [62]:
df_air = df_air.dropna(axis=1, how='all')

In [63]:
df_air.replace(-200, np.nan, inplace=True)
df_air.dropna(inplace=True)

In [64]:
X = df_air.drop(['CO(GT)', 'Date', 'Time'], axis=1)
y = df_air['CO(GT)']

In [65]:
X_train, X_test, y_train, y_test = train_test_split(
X, y,
test_size=0.3,
random_state=42
)

In [66]:
from sklearn.tree import DecisionTreeRegressor

param_grid = {
'max_depth': [3, 5, 7, 10, None],
'min_samples_leaf': [1, 5, 10, 20]
}

reg = DecisionTreeRegressor(random_state=42)

grid_search = GridSearchCV(
reg,
param_grid,
cv=5,
scoring='neg_mean_absolute_error',
n_jobs=-1
)

grid_search.fit(X_train, y_train)

  pid = os.fork()


In [67]:
best_reg = grid_search.best_estimator_
best_reg.fit(X_train, y_train)

In [68]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

y_pred = best_reg.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

mae, rmse, r2

(0.22820814118284186, 0.32408981270481774, 0.9450882701621811)

In [69]:
df = pd.read_csv('UCI_Credit_Card.csv')
df.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,1,20000.0,2,2,1,24,2,2,-1,-1,...,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,2,120000.0,2,2,2,26,-1,2,0,0,...,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1
2,3,90000.0,2,2,2,34,0,0,0,0,...,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,4,50000.0,2,2,1,37,0,0,0,0,...,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
4,5,50000.0,1,2,1,57,-1,0,-1,0,...,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0


In [70]:
X = df.drop(['ID', 'default.payment.next.month'], axis=1)
y = df['default.payment.next.month']

In [71]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
X, y,
test_size=0.3,
random_state=42,
stratify=y
)

In [72]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

param_grid = {
'max_depth': [3, 5, 7, 10, None],
'min_samples_leaf': [1, 5, 10, 20],
'min_samples_split': [2, 10, 20]
}

clf = DecisionTreeClassifier(random_state=42)

grid_search = GridSearchCV(
clf,
param_grid=param_grid,
cv=5,
scoring='f1',
n_jobs=-1
)

grid_search.fit(X_train, y_train)

  pid = os.fork()


In [73]:
best_clf = grid_search.best_estimator_
best_clf.fit(X_train, y_train)

In [74]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

y_pred = best_clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

accuracy, precision, recall, f1

(0.8175555555555556,
 0.6623255813953488,
 0.35760924158714213,
 0.4644487932159165)

In [81]:
df_air = pd.read_csv('AirQuality.csv', sep=';', decimal=',')


In [82]:
df_air = df_air.dropna(axis=1, how='all')

In [83]:
df_air.replace(-200, np.nan, inplace=True)
df_air.dropna(inplace=True)

In [84]:
X = df_air.drop(['CO(GT)', 'Date', 'Time'], axis=1)
y = df_air['CO(GT)']

In [85]:
X_train, X_test, y_train, y_test = train_test_split(
X, y,
test_size=0.3,
random_state=42
)

In [86]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV

param_grid = {
'max_depth': [3, 5, 7, 10, None],
'min_samples_leaf': [1, 5, 10, 20]
}

reg = DecisionTreeRegressor(random_state=42)

grid_search = GridSearchCV(
reg,
param_grid=param_grid,
cv=5,
scoring='neg_mean_absolute_error',
n_jobs=-1
)

grid_search.fit(X_train, y_train)

  pid = os.fork()


In [87]:
best_reg = grid_search.best_estimator_
best_reg.fit(X_train, y_train)

In [88]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

y_pred = best_reg.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

mae, rmse, r2

(0.22820814118284186, 0.32408981270481774, 0.9450882701621811)