In [4]:
import numpy as np
import pandas as pd
from sklearn_extra.cluster import KMedoids

from helper_functions.get_processed_data import *
from model_training_functions.kfold_cross_validation import *

# Relative location of the training CSV; reused later when the raw data is reloaded.
path = 'XY_train (2).csv'

# Raw, unprocessed training frame.
data = pd.read_csv(path)

In [5]:
%%capture 
# Replace the raw frame with the output of get_processed_data; %%capture hides
# anything the call prints.
# NOTE(review): this overwrites `data`, so re-running the cell without re-running
# the load cell passes already-processed data back into get_processed_data.
data = get_processed_data(data)

In [6]:
# Preview the first rows of the processed frame.
data.head()

Unnamed: 0,Has relevent experience,Full time course,training_hours,experience,qualification_score,city_development_index,target
1,1,0,0.779104,0.24,0.418507,0.897,0
4,1,1,0.125373,0.16,0.649334,0.555,1
5,1,0,0.050746,1.0,0.0458,0.897,0
6,1,0,0.083582,0.24,0.449041,0.92,0
7,1,0,0.361194,0.52,0.281194,0.698,1


## Decision Tree

In [7]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import roc_auc_score
from model_training_functions.decision_tree import *


### Tuning Hyperparameters

In [8]:
# Run the four decision-tree tuning helpers on the processed frame
# (presumably provided by the model_training_functions.decision_tree star import).
# NOTE(review): the recorded output of this cell ends with KeyError: 'Test Score'
# and an empty figure; the error is raised inside the imported helpers, so check
# the score column names they use before relying on this cell.
max_depth(data, do_plot=True)
criterion(data, do_plot=True)
min_samples_leaf(data, do_plot=True)
min_impurity_decrease(data, do_plot=True)



Test accuracy: 0.799 +/- 0.015
Training accuracy: 0.799 +/- 0.002


Test accuracy: 0.799 +/- 0.016
Training accuracy: 0.800 +/- 0.002


Test accuracy: 0.800 +/- 0.014
Training accuracy: 0.801 +/- 0.002


Test accuracy: 0.799 +/- 0.014
Training accuracy: 0.801 +/- 0.002


Test accuracy: 0.798 +/- 0.015
Training accuracy: 0.802 +/- 0.002


Test accuracy: 0.797 +/- 0.014
Training accuracy: 0.803 +/- 0.001


Test accuracy: 0.794 +/- 0.013
Training accuracy: 0.805 +/- 0.001


Test accuracy: 0.790 +/- 0.014
Training accuracy: 0.810 +/- 0.002


Test accuracy: 0.785 +/- 0.015
Training accuracy: 0.815 +/- 0.002


KeyError: 'Test Score'

<Figure size 864x360 with 0 Axes>

### Picked model training

In [None]:
# Decision tree built with the hyperparameter values picked for this section,
# then evaluated through the project's k-fold helper.
dt_model = DecisionTreeClassifier(
    max_depth=5,
    criterion='entropy',
    min_samples_leaf=100,
    min_impurity_decrease=0.0001,
)
train_model_by_kfold(data, dt_model)

In [None]:
# Draw the top four levels of the tree on a large canvas so node text stays legible.
fig, ax = plt.subplots(figsize=(70, 30))
plot_tree(
    dt_model,
    filled=True,
    class_names=True,
    max_depth=4,
    fontsize=25,
    ax=ax,
)
plt.show()

## Artificial Neural Networks

In [None]:
from sklearn.neural_network import MLPClassifier

### No Hyperparameters

In [None]:
# Baseline: MLPClassifier with library defaults, evaluated through the k-fold helper.
ann_model = MLPClassifier()
train_model_by_kfold(data, ann_model)

In [None]:
from model_training_functions.artificial_neural_networks import *
# Run each ANN tuning helper on the processed frame; the second positional
# argument is passed as True exactly as in the original calls.
max_iterations(data, True)
hidden_layer_sizes(data, True)
activations_function(data, True)

### Tuned Model Training

__Note:__  
The hyperparameter-tuning functions take a long time to run, so we left them out of the notebook.  
You can import them from **model_training_functions.artificial_neural_networks**.


In [None]:
# MLP configured with the tuned hyperparameter values, evaluated through the k-fold helper.
ann_model = MLPClassifier(
    max_iter=800,
    hidden_layer_sizes=(100, 50, 10),
    activation='relu',
    n_iter_no_change=15,
)
train_model_by_kfold(data, ann_model)

## SVM

In [None]:
from sklearn.svm import SVC

# Linear-kernel SVC with probability estimates enabled and a fixed seed.
svm_model = SVC(
    C=1,
    kernel='linear',
    decision_function_shape='ovo',
    random_state=42,
    probability=True,
)
# The cross-validation run is left disabled, as in the original cell:
# train_model_by_kfold(data, svm_model)

In [None]:
def get_C_SVM(df):
    """Cross-validate a linear-kernel SVC for C = 1.0, 1.1, ..., 1.9.

    Args:
        df: Frame passed unchanged to ``train_model_by_kfold`` for every
            candidate value of C.

    Returns:
        A DataFrame holding the scores returned by ``train_model_by_kfold``
        for each candidate, stacked with a fresh index and tagged with a
        ``'C'`` column.

    Note:
        ``train_model_by_kfold`` is defined outside this notebook; its result
        is assumed to be a DataFrame or a dict-like object that supports item
        assignment -- TODO confirm.
    """
    per_candidate = []
    for c in np.arange(1, 2, 0.1):
        # np.arange accumulates float error (e.g. 1.2000000000000002); round so
        # the value used by the model and stored in the 'C' column is clean.
        c = round(float(c), 1)
        model = SVC(C=c, kernel='linear', random_state=42, probability=True)
        scores = train_model_by_kfold(df, model)
        scores['C'] = c
        # DataFrame.append was removed in pandas 2.0 and copied the whole frame
        # on every iteration; collect the pieces and concatenate once instead.
        if not isinstance(scores, pd.DataFrame):
            scores = pd.DataFrame([scores])
        per_candidate.append(scores)
    return pd.concat(per_candidate, ignore_index=True)

In [None]:
# Scores for every candidate C; the original cell displayed `c`, a name that is
# never defined at notebook level, so it raised NameError.
C = get_C_SVM(data)
C.head(10)

In [None]:

def get_decision_function_shape_SVM(df):
    """Cross-validate a linear-kernel SVC for each decision_function_shape.

    Args:
        df: Frame passed unchanged to ``train_model_by_kfold`` for both the
            ``'ovo'`` and ``'ovr'`` settings.

    Returns:
        A DataFrame holding the scores returned by ``train_model_by_kfold``
        for each setting, stacked with a fresh index and tagged with a
        ``'decision_function_shape'`` column.

    Note:
        ``train_model_by_kfold`` is defined outside this notebook; its result
        is assumed to be a DataFrame or a dict-like object that supports item
        assignment -- TODO confirm.
    """
    decision_function_shape = ['ovo', 'ovr']
    per_setting = []
    for decision in decision_function_shape:
        model = SVC(C=1, kernel='linear', decision_function_shape=decision,
                    random_state=42, probability=True)
        scores = train_model_by_kfold(df, model)
        scores['decision_function_shape'] = decision
        # DataFrame.append was removed in pandas 2.0; gather the pieces and
        # concatenate once after the loop.
        if not isinstance(scores, pd.DataFrame):
            scores = pd.DataFrame([scores])
        per_setting.append(scores)
    return pd.concat(per_setting, ignore_index=True)

In [None]:
# Compare the two decision_function_shape settings and show the first rows.
des = get_decision_function_shape_SVM(data)
des.head()

In [None]:
def get_result_coeff_and_intrec(df):
    """Fit the selected linear SVC on every row of ``df`` and print its parameters.

    Args:
        df: Frame with a ``'target'`` column; all remaining columns are used
            as features.

    Returns:
        The fitted SVC, after its ``coef_`` and ``intercept_`` have been printed.
    """
    labels = df['target']
    features = df.drop(columns=['target'])

    classifier = SVC(
        C=1,
        kernel='linear',
        decision_function_shape='ovo',
        random_state=42,
        probability=True,
    )
    classifier.fit(features, labels)

    print('Coefficients: \n', classifier.coef_)
    print('Intercepts: \n', classifier.intercept_)
    return classifier

In [None]:
# Fit the chosen SVC on the full processed frame and keep the fitted estimator.
best_model = get_result_coeff_and_intrec(data)

## Unsupervised Learning - Clustering

In [None]:
import seaborn as sns

# Cluster on every column except the label; build the feature matrix once and
# reuse it for both fitting and prediction.
cluster_features = data.drop('target', axis=1)

model = KMedoids(n_clusters=3, random_state=42)
model.fit(cluster_features)
data['cluster'] = model.predict(cluster_features)

# Hand the labelled frame to the project's dimensionality-reduction helper for plotting.
data = reduce_dimensionality(data)


In [None]:
# Scatter of the two 'PC-*' columns, coloured by assigned cluster.
sns.scatterplot(
    data=data,
    x='PC-1',
    y='PC-2',
    hue='cluster',
    palette='Accent',
)


## Model improvement

In [None]:
# Reload the raw CSV into a separate frame (`df`) so the steps below start
# from unprocessed data rather than from the `data` frame modified above.
df = pd.read_csv(path)


In [None]:
%%capture
df = drop_nan(df)
df = replace_values(df)
df = fill_nan_values(df)
df = extract_features(df)
df = represent_data(df)
df = df.drop(
    columns=[
        'city', 'enrollee_id', 'experience',
        'training_hours', 'relevant_experience_years',
        'Full time course', 'Has relevent experience',
        'Female', 'Male', 'enrollment', 'major_discipline',
            ])

In [None]:
%%capture



In [None]:
# Same decision-tree configuration as the earlier section, now evaluated on
# the reduced feature set in `df`.
dt_model = DecisionTreeClassifier(
    max_depth=5,
    criterion='entropy',
    min_samples_leaf=100,
    min_impurity_decrease=0.0001,
)
train_model_by_kfold(df, dt_model)

In [None]:
# Re-evaluate the tuned MLP on the reduced feature set in `df`.
# The original cell had max_iter=100 with hidden_layer_sizes=(800, 50, 10),
# which looks like the two values of the tuned configuration
# (max_iter=800, hidden_layer_sizes=(100, 50, 10)) swapped; restored here so
# this run is comparable with the tuned model trained on `data`.
ann_model = MLPClassifier(
    max_iter=800,
    hidden_layer_sizes=(100, 50, 10),
    activation='relu',
    n_iter_no_change=15,
)
train_model_by_kfold(df, ann_model)