# Fast mode

In [1]:
#imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

from feature_engine.encoding import RareLabelEncoder, OneHotEncoder
from sklearn.preprocessing import RobustScaler,Normalizer,StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from scipy.stats import skew
from sklearn import metrics

In [2]:
def Fill_Null_Add_Imp_Column(data: pd.DataFrame,cols: list):
    a = data.copy()
    for col in cols:
        a[col+"_null"] = np.where(a[col].isnull(),1,0)
        filling_data = a[col].mode()[0]
        a[col].fillna(filling_data,inplace=True)
    return a

In [3]:
def Create_Groups(x):
    if x in ['1','2','3','4','5']:
        return '1-5'
    elif x in ['6','7','8','9','10']:
        return '6-10'
    elif x in ['11','12','13','14','15']:
        return '11-15'
    elif x in ['16','17','18','19','20']:
        return '16-20'
    else: return x

In [4]:
def City_Encoder(df):
    data = df.copy()
    #Determining how many labels to create
    cat = 'city'
    temp_df = pd.Series(data[cat].value_counts() / len(data))

    rare_encoder = RareLabelEncoder(tol=0.05,
                                    n_categories=8,
                                    variables=cat,
                                    replace_with='Other')

    rare_encoder.fit(data)
    data = rare_encoder.transform(data)

    #One Hot Encoding
    one_hot_top = OneHotEncoder(
        top_categories=5,
        variables=cat)

    one_hot_top.fit(data)
    data = one_hot_top.transform(data)
    
    return data

In [5]:
def EnrolledUniversity_Encoder(df):
        data = df.copy()
        cat = 'enrolled_university'
        
        one_hot_top = OneHotEncoder(top_categories=3,
                                    variables=cat)

        one_hot_top.fit(data)
        data = one_hot_top.transform(data)

        return data

In [6]:
def Nulls(df):
    data = df.copy()
    
    data.drop(['gender','company_size','company_type'], 
              axis = 1, 
              inplace = True)
    
    data = Fill_Null_Add_Imp_Column(data,['enrolled_university','major_discipline','last_new_job'])
    
    data['education_level'].fillna("No education", inplace = True)
    data['experience'] = data['experience'].apply(Create_Groups)
    data['experience'].fillna('<1', inplace = True)
    
    return data

In [7]:
def Categorical(X):
    data = X.copy()
    data = City_Encoder(data)
    data['relevent_experience'] = data['relevent_experience'].apply(lambda x: 1 if 'Has' in x else 0)
    data = EnrolledUniversity_Encoder(data)
    data['education_level'] = data['education_level'].replace({"No education": 0,
                                                               "Primary School":1,
                                                               "High School":2,
                                                               "Graduate":3,
                                                               "Masters":4,
                                                               "Phd":5})
    data['major_discipline'] = data['major_discipline'].apply(lambda x: 1 if "STEM" in x else 0)
    data['experience'] = data['experience'].replace({"<1": 0,
                                                     "1-5":1,
                                                     "6-10":2,
                                                     "11-15":3,
                                                     "16-20":4,
                                                     ">20":5})
    data['last_new_job'] = data['last_new_job'].replace({"never": 0,
                                                         "1":1,
                                                         "2":2,
                                                         "3":3,
                                                         "4":4,
                                                         ">4":5})
    return data

In [8]:
def Process_Data(X, standardize = False):
    """ 
    Function that cleans the data from the dataset
    """
    #Removing and Filling Nulls
    data_null = Nulls(X)
    
    #Label Encoders
    data_cleaned = Categorical(data_null)
    
    if standardize:     
        columns = ['city_development_index','training_hours']
        data = data_cleaned[columns]
        scaler = StandardScaler().fit(data)
        data = scaler.transform(data)
        data_cleaned[columns] = data
    
    return data_cleaned

In [9]:
data_train = pd.read_csv('z_train.csv')
data_test = pd.read_csv('z_test.csv')

In [10]:
X = Process_Data(data_train.drop(['target','enrollee_id'], axis = 1))
y = data_train['target']

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Models

## ---- Logistic

In [12]:
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
df_predicted = pd.DataFrame({'Actual':y_test, 'Predicted':y_pred})
display(df_predicted)

acc = metrics.accuracy_score(y_true = y_test, 
                             y_pred = y_pred)

precision = metrics.precision_score(y_true = y_test, 
                             y_pred = y_pred)

recall = metrics.recall_score(y_true = y_test, 
                             y_pred = y_pred)

f1 = metrics.f1_score(y_true = y_test, 
                             y_pred = y_pred)

print(f"Accuracy: {acc}\nPrecision: {precision}\nRecall: {recall}\nf1: {f1}")

display(metrics.confusion_matrix(y_test,y_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0,Actual,Predicted
5047,0.0,0.0
14023,1.0,0.0
1634,0.0,0.0
2414,0.0,0.0
195,0.0,0.0
...,...,...
13635,0.0,0.0
7461,1.0,1.0
508,1.0,0.0
994,0.0,0.0


Accuracy: 0.7870189171559034
Precision: 0.611731843575419
Recall: 0.2987721691678035
f1: 0.4014665444546288


array([[2194,  139],
       [ 514,  219]], dtype=int64)

## ---- Tree Classification

In [13]:
model = DecisionTreeClassifier(max_depth=6, random_state = 0)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
df_predicted = pd.DataFrame({'Actual':y_test, 'Predicted':y_pred})
display(df_predicted)

acc = metrics.accuracy_score(y_true = y_test, 
                             y_pred = y_pred)

precision = metrics.precision_score(y_true = y_test, 
                             y_pred = y_pred)

recall = metrics.recall_score(y_true = y_test, 
                             y_pred = y_pred)

f1 = metrics.f1_score(y_true = y_test, 
                             y_pred = y_pred)

print(f"Accuracy: {acc}\nPrecision: {precision}\nRecall: {recall}\nf1: {f1}")

display(metrics.confusion_matrix(y_test,y_pred))

Unnamed: 0,Actual,Predicted
5047,0.0,1.0
14023,1.0,1.0
1634,0.0,0.0
2414,0.0,0.0
195,0.0,0.0
...,...,...
13635,0.0,0.0
7461,1.0,1.0
508,1.0,0.0
994,0.0,0.0


Accuracy: 0.7860404435746902
Precision: 0.5803757828810021
Recall: 0.3792633015006821
f1: 0.45874587458745875


array([[2132,  201],
       [ 455,  278]], dtype=int64)

In [14]:
def Tain_Model(model, archive_train, archive_test, standardize = False):
    """ 
        Functions that trains a model and return a df with "enroll_id "and "target"
    """
    #Extraction
    data_train = pd.read_csv(archive_train)
    data_test = pd.read_csv(archive_test)

    #Cleaning
    X_train = Process_Data(data_train.drop(['target','enrollee_id'], axis = 1),
                           standardize = standardize)
    y_train = data_train['target']
    X_test = Process_Data(data_test.drop('enrollee_id', axis = 1),
                          standardize = standardize)
    #Training
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    
    #Return
    df_predicted = pd.DataFrame({'enrollee_id':data_test['enrollee_id'], 
                                 'target':y_pred})
    
    df_predicted.to_csv('submission.csv', index=False) 

In [15]:
Tain_Model(model = DecisionTreeClassifier(max_depth=6),
           archive_train='z_train.csv', 
           archive_test='z_test.csv',
           standardize=False)

In [16]:
data_test['major_discipline'].isnull().mean()

0.15031315240083507