In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sb
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import balanced_accuracy_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import MinMaxScaler,StandardScaler
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found under it.
input_paths = (
    os.path.join(root, fname)
    for root, _subdirs, fnames in os.walk('/kaggle/input')
    for fname in fnames
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the raw training set and take a first look at its schema and summary statistics.
data = pd.read_csv("/kaggle/input/hell-week-mic/train.csv")
# DataFrame.info() writes its report to stdout itself and returns None, so it must
# not be wrapped in print() (that only appended a stray "None" line to the output).
data.info()
# Lift the column display limit so the wide describe() table below is not truncated.
pd.set_option('display.max_columns', None)
data.describe(include='all')

In [None]:
# Preview the first rows of the raw training frame.
data.head()

In [None]:
# Show 20 randomly chosen rows of the raw frame. No random_state is passed, so a
# different set of rows is displayed on every run.
data.sample(20)

In [None]:
# List the column labels of the raw training frame.
data.columns

Let's experiment with only the rows that have no missing values. For the final model this loses too much data to be acceptable, but trying things out on this reduced dataset should make tuning easier.

In [None]:
# Columns excluded from this experiment (presumably identifier / free-text
# fields -- verify against the dataset description).
dropped_columns = ['UID','ph_no', 'cvv','credit_card_number','job','email','url','country','emoji','name']
# Remove those columns first, then keep only the rows that have no missing value left.
data1 = data.drop(columns=dropped_columns).dropna()
data1.describe()

Function to extract float values:

In [None]:
import re
def extract_float(text):
    """Return the first decimal number found in ``text`` as a float.

    Parameters
    ----------
    text : str or number or None
        A cell value. Strings are searched for the first token of the form
        ``[sign]digits.digits`` (e.g. ``"-12.5"``). Non-string values are
        converted with ``float()`` directly, so numeric cells pass through
        unchanged and mapping a column a second time does not fail.

    Returns
    -------
    float or None
        The parsed value, or ``None`` when a string contains no decimal
        number or a non-string value cannot be converted (e.g. ``None``).

    Notes
    -----
    Integers written without a decimal point inside a string (``"42"``) are
    not matched; only tokens with digits on both sides of a ``.`` are.
    """
    if not isinstance(text, str):
        # re functions raise TypeError on non-string input, which used to abort
        # the whole column-wide .map() as soon as one cell was already numeric.
        try:
            return float(text)
        except (TypeError, ValueError):
            return None

    # Only the first match is needed, so stop scanning there instead of
    # collecting every match in the string.
    match = re.search(r'[-+]?\d+\.\d+', text)
    if match is None:
        return None
    return float(match.group())
# Feature frame: everything in data1 except the 'state' target column.
data2 = data1.drop(columns='state')

In [None]:
# Preview data1: the excluded columns are gone and only rows without missing
# values remain; the 'state' column is still present.
data1.head()

In [None]:
# Replace every feature column, one at a time, with the number that
# extract_float pulls out of each cell.
for feature in data2.columns:
    parsed_column = data2[feature].map(extract_float)
    data2[feature] = parsed_column

In [None]:
# Summary statistics of the feature columns after they were passed through extract_float.
data2.describe()

Let's do an XGBoost trial run.

In [None]:
# Build the feature matrix and the integer-encoded target, then hold out a test split.
X = data2.copy()
le = preprocessing.LabelEncoder()
# data1/data2 keep the gapped index left behind by dropna(); give y the same
# index so that features and labels stay aligned by label as well as by position.
y = pd.DataFrame({'state': le.fit_transform(data1['state'])}, index=X.index)
# Fixed seed so the split (and every score computed from it) is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Trial multi-class XGBoost fit: train, plot the learning curves, report test metrics.
# NOTE(review): num_class is hard-coded to 100 -- presumably the number of distinct
# 'state' labels; confirm against the data.
# NOTE(review): the held-out split is used both as an eval_set for early stopping
# and for the final report below, so the reported scores are likely optimistic.
xgb_model = xgb.XGBClassifier(
    objective='multi:softmax',
    num_class=100,
    eval_metric=['merror', 'mlogloss'],
    max_depth=5,
    learning_rate=0.4,
    early_stopping_rounds=10,
)
xgb_model.fit(X_train, y_train, verbose=0, eval_set=[(X_train, y_train), (X_test, y_test)])

# evals_result() is keyed by eval_set position: validation_0 = train, validation_1 = test.
results = xgb_model.evals_result()
epochs = len(results['validation_0']['mlogloss'])
x_axis = range(0, epochs)

# One learning-curve figure per tracked metric. The titles no longer mention
# GridSearchCV, because no grid search is run in this cell.
for metric in ('mlogloss', 'merror'):
    fig, ax = plt.subplots(figsize=(9, 5))
    ax.plot(x_axis, results['validation_0'][metric], label='Train')
    ax.plot(x_axis, results['validation_1'][metric], label='Test')
    ax.set_xlabel('Boosting round')
    ax.set_ylabel(metric)
    ax.set_title(f'XGBoost {metric}')
    ax.legend()
    plt.show()

# Evaluate on the held-out split.
y_pred = xgb_model.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Testing various values of `max_depth`

In [None]:
# 3-fold cross-validated sweep over max_depth (4..9) for the XGBoost classifier.
data3 = data2.copy()
le = preprocessing.LabelEncoder()
data3['state'] = le.fit_transform(data1['state'])
# Seed the shuffle: an unseeded shuffled KFold produces new folds on every
# split() call, so each depth would be scored on different folds and the
# comparison between depths would be neither fair nor reproducible.
kf = KFold(3, shuffle=True, random_state=42)
for depth in range(4, 10):
    print("depth:", depth)
    fold_accuracies = []
    for train_index, test_index in kf.split(data3):
        xgb_model = xgb.XGBClassifier(
            objective='multi:softmax',
            num_class=100,
            eval_metric=['merror', 'mlogloss'],
            max_depth=depth,
            learning_rate=0.4,
            early_stopping_rounds=15,
        )
        train = data3.iloc[train_index]
        test = data3.iloc[test_index]
        X_train = train.drop('state', axis=1)
        X_test = test.drop('state', axis=1)
        y_train = train['state']
        y_test = test['state']
        xgb_model.fit(X_train, y_train, verbose=0, eval_set=[(X_train, y_train), (X_test, y_test)])
        y_pred = xgb_model.predict(X_test)
        print(classification_report(y_test, y_pred))
        fold_accuracies.append(accuracy_score(y_test, y_pred))
    # One comparable number per depth, so the best setting can be read off directly
    # instead of comparing the long per-class reports by eye.
    print("mean accuracy:", np.mean(fold_accuracies))

The optimal value of `max_depth` is 7–8.