In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
import pandas as pd 

In [2]:
data = pd.read_csv('data_processed.csv')

In [3]:
data.head()

Unnamed: 0,absences_left,age_left,absences,age,goout_1,goout_2,goout_3,goout_4,goout_5,failures_0,failures_1,failures_2,failures_3,approved
0,6,18,6,18,0,0,0,1,0,1,0,0,0,0
1,4,17,4,17,0,0,1,0,0,1,0,0,0,0
2,10,15,10,15,0,1,0,0,0,0,0,0,1,1
3,2,15,2,15,0,1,0,0,0,1,0,0,0,1
4,4,16,4,16,0,1,0,0,0,1,0,0,0,1


In [4]:
x = data.iloc[:, 2:-1]
y = data.iloc[:, -1]

In [5]:
logistic_regression = LogisticRegression()
results = cross_val_score(logistic_regression, x , y , cv = 10)

minimum_value = results.mean() - 2 * results.std()
minimum_value = (minimum_value * 100).round(2)

 

maximum_value = results.mean() + 2 * results.std()
maximum_value = (maximum_value * 100).round(2)

print(f'So with Logist Regression(with out tuning) i expect the accuracy the will be between: [{minimum_value}%,{maximum_value}%]')
logistic_regression.fit(x, y)
y_pred = logistic_regression.predict(x)
accuracy_real = accuracy_score(y, y_pred) * 100
print(f'But the real accuracy is(with out tuning): {accuracy_real.round(2)}%')

So with Logist Regression(with out tuning) i expect the accuracy the will be between: [58.16%,83.57%]
But the real accuracy is(with out tuning): 72.15%


In [18]:
params = {
    'logist_regression__tol': [1e-4,1e-5,1e-3],
    'logist_regression__max_iter': [100,150,200,1000],
   'logist_regression__multi_class': ['ovr','multinomial','auto'], 
}

logistic_regression = LogisticRegression()
pipeline_lr = Pipeline([('scaler',  StandardScaler()), ('logist_regression', LogisticRegression())])

results = GridSearchCV(pipeline_lr, params, cv = 5, return_train_score=True, verbose=0)
results.fit(x,y)
logistic_regression = results.best_estimator_

results = pd.DataFrame(results.cv_results_)
results_sorted = results.sort_values('mean_test_score', ascending= False)
for index, row in results_sorted.head(1).iterrows():
  print("So with Logist Regression(with tuning) i expect the accuracy the will be between:[%.2f%% , %.2f%%]" % ((row.mean_test_score - row.std_test_score * 2) * 100, (row.mean_test_score + row.std_test_score * 2) * 100))

logistic_regression.fit(x,y)
y_pred = logistic_regression.predict(x)
accuracy_real = accuracy_score(y, y_pred) * 100
print(f'But the real accuracy is(with tuning): {accuracy_real.round(2)}%')

So with Logist Regression(with tuning) i expect the accuracy the will be between:[63.73% , 78.55%]
But the real accuracy is(with tuning): 72.41%


In [19]:
knn = KNeighborsClassifier()
results = cross_val_score(knn, x , y , cv = 10)

minimum_value = results.mean() - 2 * results.std()
minimum_value = (minimum_value * 100).round(2)
results = cross_val_score(knn, x , y , cv = 10)

 

maximum_value = results.mean() + 2 * results.std()
maximum_value = (maximum_value * 100).round(2)

print(f'So with Knn(with out tuning) i expect the accuracy the will be between: [{minimum_value}%,{maximum_value}%]')
knn.fit(x, y)
y_pred = knn.predict(x)
accuracy_real = accuracy_score(y, y_pred) * 100
print(f'But the real accuracy is(with out tuning): {accuracy_real.round(2)}%')

So with Knn(with out tuning) i expect the accuracy the will be between: [57.02%,78.68%]
But the real accuracy is(with out tuning): 78.48%


In [23]:
params = {
    'knnClassifier__n_neighbors': range(5,35),
    'knnClassifier__weights': ['uniform','distance'],
    'knnClassifier__leaf_size': range(30,50)
}

knn = KNeighborsClassifier()
pipeline_knn = Pipeline([('scaler',  StandardScaler()), ('knnClassifier', KNeighborsClassifier())])

results = GridSearchCV(pipeline_knn, params, cv = 10, return_train_score=True, verbose=0)
results.fit(x,y)
knn = results.best_estimator_

results = pd.DataFrame(results.cv_results_)
results_sorted = results.sort_values('mean_test_score', ascending= False)
for index, row in results_sorted.head(1).iterrows():
  print("So with Logist Regression(with tuning) i expect the accuracy the will be between:[%.2f%% , %.2f%%]" % ((row.mean_test_score - row.std_test_score * 2) * 100, (row.mean_test_score + row.std_test_score * 2) * 100))

knn.fit(x,y)
y_pred = knn.predict(x)
accuracy_real = accuracy_score(y, y_pred) * 100
print(f'But the real accuracy is(with tuning): {accuracy_real.round(2)}%')

So with Logist Regression(with tuning) i expect the accuracy the will be between:[64.35% , 80.41%]
But the real accuracy is(with tuning): 75.44%


In [24]:
forest = RandomForestClassifier()
results = cross_val_score(forest, x , y , cv = 10)
minimum_value = results.mean() - 2 * results.std()
minimum_value = (minimum_value * 100).round(2)

 

maximum_value = results.mean() + 2 * results.std()
maximum_value = (maximum_value * 100).round(2)

print(f'So with Random Forest(with out tuning) i expect the accuracy the will be between: [{minimum_value}%,{maximum_value}%]')

forest.fit(x, y)
y_pred = forest.predict(x)
accuracy_real = accuracy_score(y, y_pred) * 100
print(f'But the real accuracy is: {accuracy_real.round(2)}%')

So with Random Forest(with out tuning) i expect the accuracy the will be between: [53.25%,83.32%]
But the real accuracy is: 89.62%


In [25]:
params = {
    'random_forest__n_estimators': [50, 70, 90,100,150,200,250],
    'random_forest__criterion': ['gini','entropy'],
    'random_forest__max_depth': [30,70,80,90, None],
    'random_forest__min_samples_leaf': [1,2,4,6,8],
    'random_forest__max_leaf_nodes' : [10,20,30, None]
}
pipeline_random_forest = Pipeline([('scaler',  StandardScaler()), ('random_forest', RandomForestClassifier())])

forest = RandomForestClassifier()
results = GridSearchCV(pipeline_random_forest, params, cv = 10)
results.fit(x,y)
forest = results.best_estimator_
results = pd.DataFrame(results.cv_results_)
results_sorted = results.sort_values('mean_test_score', ascending= False)
for index, row in results_sorted.head(1).iterrows():
  print("So with random forest(with tuning) i expect the accuracy the will be between:[%.2f%%, %.2f%%]" % ((row.mean_test_score - row.std_test_score * 2) * 100, (row.mean_test_score + row.std_test_score * 2)* 100))

forest.fit(x,y)
y_pred = forest.predict(x)
accuracy_real = accuracy_score(y, y_pred) * 100
print(f'But the real accuracy is(with tuning): {accuracy_real.round(2)}%')

So with random forest(with tuning) i expect the accuracy the will be between:[65.06%, 83.28%]
But the real accuracy is(with tuning): 79.49%


In [11]:
model = CatBoostClassifier()
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 0)
model.fit(x_train, y_train)

4	total: 1.09s	remaining: 683ms
616:	learn: 0.4257508	total: 1.1s	remaining: 681ms
617:	learn: 0.4256383	total: 1.1s	remaining: 679ms
618:	learn: 0.4252162	total: 1.1s	remaining: 677ms
619:	learn: 0.4251086	total: 1.1s	remaining: 675ms
620:	learn: 0.4250703	total: 1.1s	remaining: 672ms
621:	learn: 0.4249762	total: 1.1s	remaining: 670ms
622:	learn: 0.4246661	total: 1.1s	remaining: 668ms
623:	learn: 0.4242352	total: 1.11s	remaining: 667ms
624:	learn: 0.4240702	total: 1.11s	remaining: 664ms
625:	learn: 0.4240294	total: 1.12s	remaining: 668ms
626:	learn: 0.4239959	total: 1.12s	remaining: 666ms
627:	learn: 0.4239851	total: 1.12s	remaining: 664ms
628:	learn: 0.4238054	total: 1.12s	remaining: 662ms
629:	learn: 0.4237818	total: 1.12s	remaining: 660ms
630:	learn: 0.4237589	total: 1.12s	remaining: 658ms
631:	learn: 0.4236670	total: 1.13s	remaining: 656ms
632:	learn: 0.4235999	total: 1.13s	remaining: 654ms
633:	learn: 0.4234884	total: 1.13s	remaining: 652ms
634:	learn: 0.4234237	total: 1.13s	rema

<catboost.core.CatBoostClassifier at 0x7f1f877b95d0>

In [12]:
accuracy = model.score(x_test, y_test) * 100
print(f'with Cat boost the accuracy was(with split): {accuracy.round(2)}%')

with Cat boost the accuracy was(with split): 73.74%


In [13]:
model.fit(x, y)

1s	remaining: 689ms
616:	learn: 0.4391864	total: 1.1s	remaining: 686ms
617:	learn: 0.4390909	total: 1.11s	remaining: 684ms
618:	learn: 0.4389095	total: 1.11s	remaining: 682ms
619:	learn: 0.4388285	total: 1.11s	remaining: 680ms
620:	learn: 0.4386527	total: 1.11s	remaining: 679ms
621:	learn: 0.4383485	total: 1.11s	remaining: 677ms
622:	learn: 0.4383155	total: 1.11s	remaining: 675ms
623:	learn: 0.4381594	total: 1.12s	remaining: 674ms
624:	learn: 0.4379971	total: 1.12s	remaining: 672ms
625:	learn: 0.4378747	total: 1.12s	remaining: 670ms
626:	learn: 0.4374763	total: 1.12s	remaining: 668ms
627:	learn: 0.4373475	total: 1.12s	remaining: 666ms
628:	learn: 0.4372922	total: 1.13s	remaining: 664ms
629:	learn: 0.4372299	total: 1.13s	remaining: 662ms
630:	learn: 0.4372033	total: 1.13s	remaining: 660ms
631:	learn: 0.4370789	total: 1.13s	remaining: 658ms
632:	learn: 0.4369424	total: 1.13s	remaining: 656ms
633:	learn: 0.4368839	total: 1.13s	remaining: 654ms
634:	learn: 0.4366866	total: 1.13s	remaining:

<catboost.core.CatBoostClassifier at 0x7f1f877b95d0>

In [14]:
accuracy = model.score(x, y) * 100
print(f'with Cat boost the accuracy was(with out split): {accuracy.round(2)}%')

with Cat boost the accuracy was(with out split): 84.3%
