In [1]:
# Settings for the training run. Each comment names the helper that reads the key.
config = {
    # reader(): False loads data1/2/3.csv, True loads the *_test.csv files.
    'isTest': False,
    # ordinal(): columns passed to OrdinalEncoder.
    'ordinalEncoder': ['mode'],
    # qr_outlier(): columns capped at the 1.5*IQR fences.
    'Outlier': ['duration', 'popularity'],
    # Not read by any cell shown here; other values noted: 'loglog', 'sqrt', 'inverse'.
    'transformMethod': ['log'],
    # k_imputer(): n_neighbors for KNNImputer.
    'kImputation': 15,
    # xgb_model(): parameter grid handed to GridSearchCV.
    'xgbModel': {
        'max_depth': [8],
        'n_estimators': [200],
        'learning_rate': [0.1],
        'random_state': [0],
    },
    # Not read by any cell shown here.
    'consolidation': ['track_name', 'time_signature'],
    # correlation(): each pair is summed into a new column c1, c2, ...
    'correlation': [['f2', 'f3']],
    # negative_correlation(): each pair is combined into a new column nc1, nc2, ...
    'negativeCorrelation': [['f5', 'f2']],
    # Overwritten by xgb_model() with GridSearchCV.best_params_.
    'best_params': {
        'learning_rate': 0.1,
        'max_depth': 8,
        'n_estimators': 200,
    },
}


In [2]:
import numpy as np
import pandas as pd
import os

def reader(config):
    """Load the three CSV parts, merge them and shorten the column names.

    Parameters
    ----------
    config : dict
        Only ``config['isTest']`` is read. When truthy the ``*_test.csv``
        files are loaded, otherwise the training files.

    Returns
    -------
    pandas.DataFrame
        The three files merged with ``pd.merge`` defaults (inner join on the
        columns they share), with ``feature_N`` renamed to ``fN`` and the
        capitalised headers renamed to snake_case.

    Notes
    -----
    Also sets the global pandas option ``display.max_columns`` to ``None``.
    """
    pd.set_option('display.max_columns', None)

    if config['isTest']:
        paths = ('data1_test.csv', 'data2_test.csv', 'data3_test.csv')
    else:
        paths = ('data1.csv', 'data2.csv', 'data3.csv')
    first, second, third = (pd.read_csv(path) for path in paths)

    merged = pd.merge(pd.merge(first, second), third)

    feature_names = {
        'feature_1': 'f1',
        'feature_2': 'f2',
        'feature_3': 'f3',
        'feature_4': 'f4',
        'feature_5': 'f5',
        'feature_6': 'f6',
        'feature_7': 'f7',
        'feature_8': 'f8',
        'feature_9': 'f9',
    }
    header_names = {
        'Artist Name': 'artist_name',
        'Track Name': 'track_name',
        'Class': 'class',
        'Popularity': 'popularity',
    }
    # The two mappings share no names, so one rename equals the two sequential ones.
    return merged.rename(columns={**feature_names, **header_names})

In [3]:
from sklearn.preprocessing import OrdinalEncoder
def ordinal(config, data):
    """Ordinal-encode the columns listed in ``config['ordinalEncoder']``.

    Works on a copy of ``data``. Returns ``(encoded_frame, fitted_encoder)``
    so the same encoder can be reused on other data.
    """
    encoded = data.copy()
    target_cols = config['ordinalEncoder']
    oe = OrdinalEncoder().fit(encoded[target_cols])
    encoded[target_cols] = oe.transform(encoded[target_cols])
    return encoded, oe

# Imputer

In [4]:
from sklearn.impute import KNNImputer
import pandas as pd
def k_imputer(config, data):
    """Fill missing values in the numeric columns with ``KNNImputer``.

    Parameters
    ----------
    config : dict
        ``config["kImputation"]`` is used as ``n_neighbors``.
    data : pandas.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` whose ``float64``/``int64`` columns (except
        ``"class"``) are replaced by the imputed values.
    """
    df = data.copy()
    numeric = df.select_dtypes(include=['float64', 'int64'])
    # Keep the target out of the neighbour search. errors="ignore" lets a frame
    # without a numeric "class" column (e.g. unlabeled data) through.
    features = numeric.drop(columns="class", errors="ignore")
    if features.empty:
        # Nothing to impute (no numeric columns or no rows).
        return df

    imputer = KNNImputer(n_neighbors=config["kImputation"])
    # Reuse the original index. Assignment below aligns on labels, so the
    # default RangeIndex would misplace values or leave NaN whenever `data`
    # is not indexed 0..n-1.
    imputed = pd.DataFrame(
        imputer.fit_transform(features),
        index=features.index,
        columns=features.columns,
    )
    df[imputed.columns] = imputed
    return df

# Feature Engineering

In [5]:
def consol_tr(df):
    """Collapse ``track_name`` into a binary flag.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``track_name`` column; it is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` where ``track_name`` is 1 for rows equal to
        ``'track_1'`` and 0 for everything else (missing values included).
    """
    # Work on a copy, like the other helpers, so the caller's frame is left
    # intact and re-running the cell does not depend on earlier state.
    out = df.copy()
    # One vectorised comparison. This avoids writing ints into a string column
    # before the cast.
    out["track_name"] = (out["track_name"] == 'track_1').astype(int)
    return out

In [6]:
def correlation(config, data):
    """Add one summed feature per column pair in ``config["correlation"]``.

    The n-th pair (1-based) produces column ``c<n>``, the sum of its two
    columns. ``data`` is copied, not modified.
    """
    df = data.copy()
    for position, pair in enumerate(config["correlation"], start=1):
        first, second = pair[0], pair[1]
        df[f"c{position}"] = df[first] + df[second]
    return df

In [7]:
def negative_correlation(config, data):
    """Add one combined feature per pair in ``config["negativeCorrelation"]``.

    The n-th pair ``[a, b]`` (1-based) produces column ``nc<n>`` equal to
    ``a / 0.001 + (1 - b)``. ``data`` is copied, not modified.
    """
    df = data.copy()
    for position, pair in enumerate(config["negativeCorrelation"], start=1):
        scaled_first = df[pair[0]] / 0.001
        complement_second = 1 - df[pair[1]]
        # NOTE(review): the first column is scaled by 1000 and dominates the sum.
        # Confirm this is intended rather than a ratio such as a / (0.001 + (1 - b)).
        df[f"nc{position}"] = scaled_first + complement_second
    return df

In [8]:
def consol_ts(df):
    """Collapse ``time_signature`` into a binary flag.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``time_signature`` column; it is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` where ``time_signature`` is 1 for rows equal to
        ``'type 4'`` and 0 for everything else (missing values included).
    """
    # Work on a copy, like the other helpers, so the caller's frame is left
    # intact and re-running the cell does not depend on earlier state.
    out = df.copy()
    # One vectorised comparison. This avoids writing ints into a string column
    # before the cast.
    out["time_signature"] = (out["time_signature"] == 'type 4').astype(int)
    return out

# Class imbalance

In [9]:
from imblearn.combine import SMOTETomek
def balance(data):
    """Resample ``data`` with ``SMOTETomek(random_state=0)``.

    The ``"class"`` column is the target and every other column is a feature.
    Returns the resampled features with the resampled ``"class"`` column
    appended as the last column. ``data`` is copied, not modified.
    """
    frame = data.copy()
    target = frame["class"]
    features = frame.drop(columns="class")
    resampler = SMOTETomek(random_state=0)
    features_res, target_res = resampler.fit_resample(features, target)
    return pd.concat([features_res, target_res], axis=1)


# Outlier

In [10]:
def qr_outlier(config, data):
    """Cap outliers at the 1.5*IQR fences for each configured column.

    Parameters
    ----------
    config : dict
        ``config['Outlier']`` lists the columns to process.
    data : pandas.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` in which, per listed column, values above
        ``Q3 + 1.5*IQR`` are set to that bound and values below
        ``Q1 - 1.5*IQR`` are set to that bound. Missing values are left as is.
    """
    df = data.copy()
    for colname in config['Outlier']:
        values = df[colname]
        q1 = values.quantile(q=0.25)
        q3 = values.quantile(q=0.75)
        iqr = q3 - q1
        lower = q1 - 1.5 * iqr
        upper = q3 + 1.5 * iqr
        # Positional boolean masks, computed before any write. Selecting by index
        # label would also overwrite non-outlier rows that share a duplicated label.
        too_high = (values > upper).to_numpy()
        too_low = (values < lower).to_numpy()
        df.loc[too_high, colname] = upper
        df.loc[too_low, colname] = lower
    return df

In [11]:
def class_based_qr(config, df):
    """Run ``qr_outlier`` on each ``"class"`` group separately.

    Each distinct value of ``df["class"]`` selects a row subset. That subset
    is passed through ``qr_outlier`` and written back in place of the
    original rows. ``df`` is copied, not modified.
    """
    data = df.copy()
    for label in data["class"].unique():
        members = data["class"] == label
        data[members] = qr_outlier(config, data[members])
    return data

# Model

In [12]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
def xgb_model(config, data):
    """Grid-search an ``XGBClassifier`` with 5-fold cross-validation.

    Parameters
    ----------
    config : dict
        ``config['xgbModel']`` is the parameter grid. ``config["best_params"]``
        is overwritten with the best parameters found (side effect).
    data : pandas.DataFrame
        Frame with a ``"class"`` target column; all other columns are features.

    Returns
    -------
    (GridSearchCV, LabelEncoder)
        The fitted search, refitted on all of ``data`` with the best
        parameters, and the encoder that maps the encoded labels back to the
        original ``"class"`` values.
    """
    x = data.drop(columns="class")
    la = LabelEncoder()
    y = la.fit_transform(data["class"])

    # No `scoring` is passed, so the search is single-metric and ranks by the
    # estimator's default score. The previous refit='f1' acted only as a truthy
    # flag and did not select F1; refit=True states what actually happens.
    gs = GridSearchCV(
        XGBClassifier(),
        config['xgbModel'],
        cv=5,
        refit=True,
        return_train_score=True,
    )
    # The search is fitted on the full data. The unseeded train/validation split
    # that used to be computed here was never used and has been removed.
    gs.fit(x, y)
    config["best_params"] = gs.best_params_
    return gs, la

In [13]:
# Training pipeline: load -> binarise -> encode -> cap outliers -> impute
# -> engineered features -> drop identifiers -> rebalance.
data = reader(config)
for binarize in (consol_tr, consol_ts):
    data = binarize(data)
data, oe = ordinal(config, data)  # `oe` is reused on the test set

# NOTE(review): 'id' is still in the frame when k_imputer runs. If it is numeric
# it takes part in the KNN distance; confirm that is intended.
for stage in (class_based_qr, k_imputer, correlation, negative_correlation):
    data = stage(config, data)

# NOTE(review): resampling happens before the cross-validation in xgb_model, so
# validation folds may contain resampled rows. Consider resampling inside each fold.
data = balance(data.drop(columns=['id', "artist_name"]))

In [14]:
# Fit the grid search, then print the best CV score followed by the per-candidate
# mean validation and mean training scores.
xgb, la = xgb_model(config, data)
print(xgb.best_score_)
for score_key in ("mean_test_score", "mean_train_score"):
    print(xgb.cv_results_[score_key])

0.9331593730112513
[0.93315937]
[0.99933888]



# Test

In [15]:
# Settings for the held-out test run. The test cells below read only 'isTest',
# 'ordinalEncoder', 'correlation' and 'negativeCorrelation'; the remaining keys
# are not referenced by any cell shown here.
config_test = {
    'isTest': True,  # reader(): load the *_test.csv files
    'ordinalEncoder': ['mode'],
    'oneHotEncoder': ['mode'],  # 'track_name' was a commented-out candidate
    'qrOutlier': ['duration', 'popularity'],
    'zOutlier': ['duration', 'popularity'],
    'sOutlier': ['duration', 'popularity'],
    'transformMethod': ['log'],  # other values noted: 'loglog', 'sqrt', 'inverse'
    'kImputation': 10,
    'sImputeColumns': ['popularity', 'f1', 'f3', 'f6', 'f8', 'f9', 'key'],
    'strategy': 'median',  # other values noted: 'median', 'most_frequent'
    'kModel': range(10, 50),
    'rfcModel': {'n_estimators': range(50, 500, 50)},
    # Wider grid kept for reference:
    #   max_depth: range(4, 15, 2), n_estimators: range(200, 1000, 200),
    #   learning_rate: [0.005, 0.01, 0.05, 0.1, 0.5]
    'xgbModel': {
        'max_depth': [8],
        'n_estimators': [200],
        'learning_rate': [0.1],
        'random_state': [0],
    },
    'consolidation': ['track_name', 'time_signature'],
    'correlation': [['f2', 'f3']],
    'negativeCorrelation': [['f5', 'f2']],
}


In [16]:
# Test-set preprocessing: same binarisation and engineered features as training.
# The encoder fitted on the training data (`oe`) is reused, not refitted.
test = reader(config_test)
for binarize in (consol_tr, consol_ts):
    test = binarize(test)

encoded_cols = config_test['ordinalEncoder']
test[encoded_cols] = oe.transform(test[encoded_cols])

for add_feature in (correlation, negative_correlation):
    test = add_feature(config_test, test)

# NOTE(review): unlike the training frame, no outlier capping or KNN imputation
# is applied here. Confirm this is intended.
test = test.drop(columns=['id', "artist_name"])
display(test)

Unnamed: 0,track_name,popularity,duration,f1,f2,f3,f4,f5,f6,f7,f8,f9,key,mode,time_signature,c1,nc1
0,1,35.0,241.080,0.714870,0.531433,0.744229,0.039064,0.157631,2.339259e-02,0.689119,0.453246,0.536206,2.0,1.0,1,1.275662,158.099089
1,1,13.0,277.788,0.639552,0.606524,0.796212,0.017386,0.000177,8.674697e-01,0.243117,0.497774,0.611248,5.0,0.0,1,1.402736,0.570183
2,1,21.0,295.673,0.553475,0.988987,0.859524,0.034342,0.000282,2.660635e-01,0.129330,0.163301,0.536443,2.0,0.0,1,1.848511,0.293142
3,1,76.0,208.026,0.694427,0.372240,0.770782,0.016313,0.725904,8.132528e-01,0.277659,0.551621,0.482431,7.0,0.0,1,1.143022,726.531374
4,1,48.0,274.480,0.671831,0.658587,0.854458,0.000215,0.027108,4.919684e-06,0.136442,0.311380,0.429131,9.0,0.0,1,1.513045,27.449847
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
745,1,30.0,326.640,0.473854,0.309164,0.672282,0.005151,0.719880,8.132538e-07,0.035761,0.381796,0.350061,9.0,0.0,1,0.981446,720.570354
746,1,50.0,188.000,0.570691,0.820783,0.873924,0.022752,0.347390,7.409646e-06,0.445291,0.792896,0.465697,2.0,0.0,1,1.694707,347.568775
747,1,60.0,194.800,0.149989,0.954945,0.927803,0.094226,0.001918,2.008034e-05,0.083613,0.451175,0.832142,3.0,1.0,1,1.882749,1.962725
748,1,45.0,157.250,0.683667,0.100912,0.564897,0.041425,0.943775,9.236947e-01,0.078635,0.019882,0.489682,4.0,0.0,0,0.665809,944.674188


In [17]:
# Predict encoded labels, map them back to the original class values, and write
# them to lastt.csv as a single "Class" column keyed by the test frame's index.
pred = la.inverse_transform(xgb.predict(test))
display(pred)
sub = pd.DataFrame({"Class": pred}, index=test.index)
sub.to_csv("lastt.csv")

array([10,  6,  9, 10, 10,  4,  9, 10, 10,  4,  6, 10,  9, 10,  4,  9,  8,
        8, 10,  9, 10,  9, 10,  4, 10,  4,  9,  4, 10,  9, 10,  4,  4,  8,
        6, 10,  6,  4,  8,  4,  8,  8, 10, 10,  8,  8,  8,  8, 10, 10, 10,
        8, 10,  6,  4,  4,  9,  6,  9,  4,  9,  6,  8,  8,  4,  9,  9,  6,
        8,  6,  9,  9,  4,  6,  4,  9,  8,  8,  4, 10,  9,  8,  4,  6,  6,
        8,  9,  8,  4,  8, 10,  4,  9,  4, 10,  9,  8,  4,  8,  9, 10,  9,
       10,  6,  8,  9,  6,  4,  9, 10,  6, 10,  6,  6,  9, 10, 10, 10,  8,
        4,  4,  8,  9,  9,  8, 10,  4,  8,  6,  6,  4,  8,  6,  9,  6,  8,
        8,  9, 10,  6,  9,  4,  8,  9,  9, 10,  4, 10,  8,  8,  8,  9,  6,
        8,  4,  8,  4,  4,  9,  4,  4,  8,  4, 10,  9,  8, 10,  9,  6, 10,
        9,  9,  4,  8,  6,  4,  9,  4, 10, 10,  9, 10,  9,  6,  4,  4, 10,
       10,  9,  6,  8, 10,  9,  8,  6,  8,  4,  6,  4,  9,  8,  9,  9, 10,
        8,  9,  4, 10, 10,  9, 10,  9,  4,  6, 10,  4, 10, 10,  9,  6,  8,
       10, 10,  6,  6,  8