In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
from collections import Counter
from sklearn.preprocessing import LabelEncoder,MinMaxScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer 
from sklearn.svm import SVC
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from sklearn.impute import KNNImputer
from sklearn.neural_network import MLPClassifier

# Folder holding the competition CSV files -- change this one value to point at a local copy.
dataset_dir = 'C:\\Users\\disha\\Documents\\Datasets\\Pet Adoption\\Dataset'
train_file_path = dataset_dir + '\\train.csv'  # training file path
test_file_path = dataset_dir + '\\test.csv'  # test file path

In [5]:
# Implementation with respect to MLP
#
# Stage 1 - load the training CSV and clean it. On completion df_train holds the
# columns [gap, condition, color_type, length(m), height(cm), X1, X2,
# breed_category, pet_category]; `label_encoder` and `imputer` stay fitted so the
# test data can be transformed the same way later in this cell.

num_neighbors = 10   #neighbors for KNN Imputer
df_train = pd.read_csv(train_file_path)

#drop pet_id
df_train = df_train.drop(columns=['pet_id'])

#label encode color_type
label_encoder = LabelEncoder()
label_encoder.fit(df_train['color_type'])
df_train['color_type'] = label_encoder.transform(df_train['color_type'])

#get into datetime format and save only date (the time of day is discarded)
for date_col in ['issue_date', 'listing_date']:
    df_train[date_col] = pd.to_datetime(pd.to_datetime(df_train[date_col]).dt.date)

#fill null values of `condition` by KNN Imputer
# NOTE(review): the imputer only ever sees this single column, so a row with a
# missing `condition` has no observed feature to measure neighbour distance on;
# scikit-learn presumably falls back to the column mean in that case -- confirm,
# and pass more columns if real nearest-neighbour imputation is intended.
imputer = KNNImputer(n_neighbors=num_neighbors)
df_train['condition'] = imputer.fit_transform(df_train[['condition']])[:, 0]

#get count where issue date is after listing date
# Strict comparison, so the printed count matches the filter below
# (rows issued and listed on the same day are kept).
print((df_train['listing_date'] < df_train['issue_date']).sum())

#Drop rows where issue date is after listing date
df_train = df_train.loc[df_train['listing_date'] >= df_train['issue_date']].copy()
#create new column with difference between issue date and listing date, in days
df_train['gap'] = (df_train['listing_date'] - df_train['issue_date']).dt.total_seconds()/(60*60*24)

#drop issue date and listing date and move gap to the front.
# Columns are selected by name so the layout that the later positional slicing
# relies on is explicit: seven features followed by the two targets.
df_train = df_train[['gap', 'condition', 'color_type', 'length(m)', 'height(cm)', 'X1', 'X2',
                     'breed_category', 'pet_category']].copy()

# Stage 2 - one hot encode color_type and rebuild df_train as a numeric frame.
# (A z-score based outlier filter was tried at this point and is disabled.)

#one hot encoding
# The last two columns are the targets; they are set aside and re-attached after
# the encoder has transformed the seven feature columns.
Y = df_train.iloc[:,-2:].to_numpy()
# Column 2 of the feature part is color_type; every other column is passed through.
columnTransformer = ColumnTransformer([('encoder', OneHotEncoder(), [2])], remainder='passthrough')
columnTransformer.fit(df_train.iloc[:,:-2])
encoded = columnTransformer.transform(df_train.iloc[:,:-2])
# The transformer returns a sparse matrix only when the result is sparse enough,
# so densify only when needed.
if hasattr(encoded, 'toarray'):
    encoded = encoded.toarray()
df_train = np.concatenate((encoded,Y),axis=1)

# The one hot columns come first in the transformer output, then the passthrough
# features, then the two targets appended above. Their number is derived from the
# array width instead of being hard-coded, so a different number of colours in
# the data does not break the column naming.
passthrough_names = ['gap','condition','length(m)','height(cm)','X1','X2','breed_category','pet_category']
col = list(range(df_train.shape[1] - len(passthrough_names)))
n_onehot = len(col)
col_names = col + passthrough_names
df_train = pd.DataFrame(df_train,columns=col_names)
# Final layout: gap, condition, <one hot columns>, length(m), height(cm), X1, X2, targets
df_train = pd.concat([df_train.iloc[:,n_onehot:n_onehot+2],df_train.iloc[:,:n_onehot],df_train.iloc[:,n_onehot+2:]],axis=1)


#change dtype of breed_category and pet category (the concatenation above made them float)
df_train['breed_category'] = df_train['breed_category'].astype('int64')
df_train['pet_category'] = df_train['pet_category'].astype('int64')

# --- Model 1 data: predict breed_category from the engineered features ---
df1 = df_train.drop(columns=['pet_category'])

# The features are converted to a plain NumPy array before they reach the scaler:
# the frame mixes integer (one hot) and string column names, and scikit-learn 1.2+
# raises a TypeError when asked to validate such mixed feature names.
x_train1 = df1.iloc[:,:-1].to_numpy()
y_train1 = df1.iloc[:,-1:]   # single-column DataFrame holding breed_category
print('Before Balancing: ',Counter(y_train1['breed_category']))

# Class re-balancing (SMOTE over-sampling followed by random under-sampling) was
# tried here and is currently disabled.

# Scale every feature to [0, 1]; scaler1 is reused on the test features.
scaler1 = MinMaxScaler()
scaler1.fit(x_train1)
x_train1 = scaler1.transform(x_train1)

# --- Model 2 data: predict pet_category; breed_category stays in as a feature ---
df2 = df_train

x_train2 = df2.iloc[:,:-1].to_numpy()
y_train2 = df2.iloc[:,-1:]   # single-column DataFrame holding pet_category
print('Before Balancing: ',Counter(y_train2['pet_category']))

# Same scaling for the second feature set; scaler2 is reused on the test features.
scaler2 = MinMaxScaler()
scaler2.fit(x_train2)
x_train2 = scaler2.transform(x_train2)

#Using MLP classifier
# random_state pins the weight initialisation and batch shuffling, so re-running
# the cell reproduces the same models and predictions.
# y_train1 / y_train2 are single-column frames; they are flattened to the 1-D
# target shape the classifier expects.
model1 = MLPClassifier(random_state=42)
model1.fit(x_train1,y_train1.to_numpy().ravel())

model2 = MLPClassifier(random_state=42)
model2.fit(x_train2,y_train2.to_numpy().ravel())

# Stage 4 - apply the training-time preprocessing to the test data.
# df_test keeps the raw rows (including pet_id) for the output file;
# temp_df_test is the feature frame fed to the models.
df_test = pd.read_csv(test_file_path)

#drop pet_id
temp_df_test = df_test.drop(columns=['pet_id'])
#label encode with the encoder fitted on the training colours
temp_df_test['color_type'] = label_encoder.transform(temp_df_test['color_type'])

#get into datetime format and save only date
for date_col in ['issue_date', 'listing_date']:
    temp_df_test[date_col] = pd.to_datetime(pd.to_datetime(temp_df_test[date_col]).dt.date)

#fill null values of `condition` with the imputer fitted on the training data
temp_df_test['condition'] = imputer.transform(temp_df_test[['condition']])[:, 0]

#get count where issue date is after listing date
print((temp_df_test['listing_date'] < temp_df_test['issue_date']).sum())

# Unlike the training data, such rows are NOT dropped here. The predictions are
# later attached to df_test column-wise by position, so removing rows from the
# feature frame would shift every following prediction onto the wrong pet_id and
# leave the last rows without a prediction.
#create new column with difference between issue date and listing date, in days
temp_df_test['gap'] = (temp_df_test['listing_date'] - temp_df_test['issue_date']).dt.total_seconds()/(60*60*24)

#drop issue date and listing date and move gap to the front (same layout as training)
temp_df_test = temp_df_test[['gap', 'condition', 'color_type', 'length(m)', 'height(cm)', 'X1', 'X2']]

#one hot encoding with the transformer fitted on the training data
temp_df_test = columnTransformer.transform(temp_df_test)
# The transformer returns a sparse matrix only when the result is sparse enough.
if hasattr(temp_df_test, 'toarray'):
    temp_df_test = temp_df_test.toarray()

# One hot columns come first, then the passthrough features; the number of one hot
# columns is derived from the array width instead of being hard-coded.
passthrough_names = ['gap','condition','length(m)','height(cm)','X1','X2']
col = list(range(temp_df_test.shape[1] - len(passthrough_names)))
n_onehot = len(col)
col_names = col + passthrough_names
temp_df_test = pd.DataFrame(temp_df_test,columns=col_names)
# Final layout: gap, condition, <one hot columns>, length(m), height(cm), X1, X2
temp_df_test = pd.concat([temp_df_test.iloc[:,n_onehot:n_onehot+2],temp_df_test.iloc[:,:n_onehot],temp_df_test.iloc[:,n_onehot+2:]],axis=1)

# Stage 5 - predict both targets and write the output file.

# The predictions are attached to df_test by position, so both frames must describe
# the same rows; fail loudly instead of writing a silently misaligned file.
if len(temp_df_test) != len(df_test):
    raise ValueError('test feature rows (%d) do not match df_test rows (%d); predictions cannot be aligned by position'
                     % (len(temp_df_test), len(df_test)))

# NumPy arrays are handed to the scalers: the feature frame mixes integer and
# string column names, which scikit-learn 1.2+ rejects with a TypeError.
x_test1 = scaler1.transform(temp_df_test.to_numpy())
pred1 = model1.predict(x_test1)
pred1_df = pd.DataFrame(pred1,columns=['breed_category'])
df_test = pd.concat([df_test,pred1_df],axis=1)

# The second model was trained with breed_category as its last feature, so the
# predicted breed is appended before scaling.
temp_df_test = pd.concat([temp_df_test,pred1_df],axis=1)
x_test2 = scaler2.transform(temp_df_test.to_numpy())
pred2 = model2.predict(x_test2)
pred2_df = pd.DataFrame(pred2,columns=['pet_category'])
df_test = pd.concat([df_test,pred2_df],axis=1)

# breed_category is written as float; the raw feature columns are removed so that
# only pet_id and the two predictions remain.
df_test['breed_category'] = df_test['breed_category'].astype('float64')
df_test = df_test.drop(columns=['issue_date','listing_date','condition','color_type','length(m)','height(cm)','X1','X2'])

df_test.to_csv('C:\\Users\\disha\\Downloads\\output_mlp_imputer10.csv',index=False)

2
Before Balancing:  Counter({0: 8999, 1: 8356, 2: 1477})
Before Balancing:  Counter({2: 10621, 1: 7182, 4: 941, 0: 88})
0


In [3]:
# Output using XG Boost algorithm
#
# Stage 1 - load the training CSV and clean it. On completion df_train holds the
# columns [gap, condition, color_type, length(m), height(cm), X1, X2,
# breed_category, pet_category]; `label_encoder` and `imputer` stay fitted so the
# test data can be transformed the same way later in this cell.

num_neighbors = 10   #neighbors for KNN Imputer
# Use the path configured in the first cell instead of repeating the literal here.
df_train = pd.read_csv(train_file_path)

#drop pet_id
df_train = df_train.drop(columns=['pet_id'])

#label encode color_type
label_encoder = LabelEncoder()
label_encoder.fit(df_train['color_type'])
df_train['color_type'] = label_encoder.transform(df_train['color_type'])

#get into datetime format and save only date (the time of day is discarded)
for date_col in ['issue_date', 'listing_date']:
    df_train[date_col] = pd.to_datetime(pd.to_datetime(df_train[date_col]).dt.date)

#fill null values of `condition` by KNN Imputer
# NOTE(review): the imputer only ever sees this single column, so a row with a
# missing `condition` has no observed feature to measure neighbour distance on;
# scikit-learn presumably falls back to the column mean in that case -- confirm,
# and pass more columns if real nearest-neighbour imputation is intended.
imputer = KNNImputer(n_neighbors=num_neighbors)
df_train['condition'] = imputer.fit_transform(df_train[['condition']])[:, 0]

#get count where issue date is after listing date
# Strict comparison, so the printed count matches the filter below
# (rows issued and listed on the same day are kept).
print((df_train['listing_date'] < df_train['issue_date']).sum())

#Drop rows where issue date is after listing date
df_train = df_train.loc[df_train['listing_date'] >= df_train['issue_date']].copy()
#create new column with difference between issue date and listing date, in days
df_train['gap'] = (df_train['listing_date'] - df_train['issue_date']).dt.total_seconds()/(60*60*24)

#drop issue date and listing date and move gap to the front.
# Columns are selected by name so the layout that the later positional slicing
# relies on is explicit: seven features followed by the two targets.
df_train = df_train[['gap', 'condition', 'color_type', 'length(m)', 'height(cm)', 'X1', 'X2',
                     'breed_category', 'pet_category']].copy()

# Stage 2 - one hot encode color_type and rebuild df_train as a numeric frame.
# (A z-score based outlier filter was tried at this point and is disabled.)

#one hot encoding
# The last two columns are the targets; they are set aside and re-attached after
# the encoder has transformed the seven feature columns.
Y = df_train.iloc[:,-2:].to_numpy()
# Column 2 of the feature part is color_type; every other column is passed through.
columnTransformer = ColumnTransformer([('encoder', OneHotEncoder(), [2])], remainder='passthrough')
columnTransformer.fit(df_train.iloc[:,:-2])
encoded = columnTransformer.transform(df_train.iloc[:,:-2])
# The transformer returns a sparse matrix only when the result is sparse enough,
# so densify only when needed.
if hasattr(encoded, 'toarray'):
    encoded = encoded.toarray()
df_train = np.concatenate((encoded,Y),axis=1)

# The one hot columns come first in the transformer output, then the passthrough
# features, then the two targets appended above. Their number is derived from the
# array width instead of being hard-coded, so a different number of colours in
# the data does not break the column naming.
passthrough_names = ['gap','condition','length(m)','height(cm)','X1','X2','breed_category','pet_category']
col = list(range(df_train.shape[1] - len(passthrough_names)))
n_onehot = len(col)
col_names = col + passthrough_names
df_train = pd.DataFrame(df_train,columns=col_names)
# Final layout: gap, condition, <one hot columns>, length(m), height(cm), X1, X2, targets
df_train = pd.concat([df_train.iloc[:,n_onehot:n_onehot+2],df_train.iloc[:,:n_onehot],df_train.iloc[:,n_onehot+2:]],axis=1)


#change dtype of breed_category and pet category (the concatenation above made them float)
df_train['breed_category'] = df_train['breed_category'].astype('int64')
df_train['pet_category'] = df_train['pet_category'].astype('int64')

# --- Model 1 data: predict breed_category from the engineered features ---
df1 = df_train.drop(columns=['pet_category'])

# The features are converted to a plain NumPy array before they reach the scaler:
# the frame mixes integer (one hot) and string column names, and scikit-learn 1.2+
# raises a TypeError when asked to validate such mixed feature names.
x_train1 = df1.iloc[:,:-1].to_numpy()
y_train1 = df1.iloc[:,-1:]   # single-column DataFrame holding breed_category
print('Before Balancing: ',Counter(y_train1['breed_category']))

# Class re-balancing (SMOTE over-sampling followed by random under-sampling) was
# tried here and is currently disabled.

# Scale every feature to [0, 1]; scaler1 is reused on the test features.
scaler1 = MinMaxScaler()
scaler1.fit(x_train1)
x_train1 = scaler1.transform(x_train1)

# --- Model 2 data: predict pet_category; breed_category stays in as a feature ---
df2 = df_train

x_train2 = df2.iloc[:,:-1].to_numpy()
y_train2 = df2.iloc[:,-1:]   # single-column DataFrame holding pet_category
print('Before Balancing: ',Counter(y_train2['pet_category']))

# Same scaling for the second feature set; scaler2 is reused on the test features.
scaler2 = MinMaxScaler()
scaler2.fit(x_train2)
x_train2 = scaler2.transform(x_train2)

#Using XGB Classifier
# Recent XGBoost releases require the class labels to be exactly 0..n_classes-1,
# but pet_category takes the values {0, 1, 2, 4} (see the Counter printed above).
# Both targets are therefore mapped to consecutive integers for training and
# mapped back to the original labels after prediction.
breed_target_encoder = LabelEncoder()
breed_target_encoder.fit(y_train1.to_numpy().ravel())
model1 = XGBClassifier()
model1.fit(x_train1,breed_target_encoder.transform(y_train1.to_numpy().ravel()))

pet_target_encoder = LabelEncoder()
pet_target_encoder.fit(y_train2.to_numpy().ravel())
model2 = XGBClassifier()
model2.fit(x_train2,pet_target_encoder.transform(y_train2.to_numpy().ravel()))

# --- Test data: apply the training-time preprocessing ---
# df_test keeps the raw rows (including pet_id) for the output file;
# temp_df_test is the feature frame fed to the models.
# The path configured in the first cell is used instead of repeating the literal.
df_test = pd.read_csv(test_file_path)

#drop pet_id
temp_df_test = df_test.drop(columns=['pet_id'])
#label encode with the encoder fitted on the training colours
temp_df_test['color_type'] = label_encoder.transform(temp_df_test['color_type'])

#get into datetime format and save only date
for date_col in ['issue_date', 'listing_date']:
    temp_df_test[date_col] = pd.to_datetime(pd.to_datetime(temp_df_test[date_col]).dt.date)

#fill null values of `condition` with the imputer fitted on the training data
temp_df_test['condition'] = imputer.transform(temp_df_test[['condition']])[:, 0]

#get count where issue date is after listing date
print((temp_df_test['listing_date'] < temp_df_test['issue_date']).sum())

# Unlike the training data, such rows are NOT dropped here. The predictions are
# attached to df_test column-wise by position below, so removing rows from the
# feature frame would shift every following prediction onto the wrong pet_id and
# leave the last rows without a prediction.
#create new column with difference between issue date and listing date, in days
temp_df_test['gap'] = (temp_df_test['listing_date'] - temp_df_test['issue_date']).dt.total_seconds()/(60*60*24)

#drop issue date and listing date and move gap to the front (same layout as training)
temp_df_test = temp_df_test[['gap', 'condition', 'color_type', 'length(m)', 'height(cm)', 'X1', 'X2']]

#one hot encoding with the transformer fitted on the training data
temp_df_test = columnTransformer.transform(temp_df_test)
# The transformer returns a sparse matrix only when the result is sparse enough.
if hasattr(temp_df_test, 'toarray'):
    temp_df_test = temp_df_test.toarray()

# One hot columns come first, then the passthrough features; the number of one hot
# columns is derived from the array width instead of being hard-coded.
passthrough_names = ['gap','condition','length(m)','height(cm)','X1','X2']
col = list(range(temp_df_test.shape[1] - len(passthrough_names)))
n_onehot = len(col)
col_names = col + passthrough_names
temp_df_test = pd.DataFrame(temp_df_test,columns=col_names)
# Final layout: gap, condition, <one hot columns>, length(m), height(cm), X1, X2
temp_df_test = pd.concat([temp_df_test.iloc[:,n_onehot:n_onehot+2],temp_df_test.iloc[:,:n_onehot],temp_df_test.iloc[:,n_onehot+2:]],axis=1)

# --- Predict both targets ---
# NumPy arrays are handed to the scalers: the feature frame mixes integer and
# string column names, which scikit-learn 1.2+ rejects with a TypeError.
x_test1 = scaler1.transform(temp_df_test.to_numpy())
# Map the consecutive class indices back to the original breed_category labels.
pred1 = breed_target_encoder.inverse_transform(np.asarray(model1.predict(x_test1)).astype(int))
pred1_df = pd.DataFrame(pred1,columns=['breed_category'])
df_test = pd.concat([df_test,pred1_df],axis=1)

# The second model was trained with breed_category as its last feature, so the
# predicted breed (in original label values) is appended before scaling.
temp_df_test = pd.concat([temp_df_test,pred1_df],axis=1)
x_test2 = scaler2.transform(temp_df_test.to_numpy())
# Map the consecutive class indices back to the original pet_category labels.
pred2 = pet_target_encoder.inverse_transform(np.asarray(model2.predict(x_test2)).astype(int))
pred2_df = pd.DataFrame(pred2,columns=['pet_category'])
df_test = pd.concat([df_test,pred2_df],axis=1)

# breed_category is written as float; the raw feature columns are removed so that
# only pet_id and the two predictions remain.
df_test['breed_category'] = df_test['breed_category'].astype('float64')
df_test = df_test.drop(columns=['issue_date','listing_date','condition','color_type','length(m)','height(cm)','X1','X2'])

df_test.to_csv('C:\\Users\\disha\\Downloads\\output_xgb_imputer10.csv',index=False)

2
Before Balancing:  Counter({0: 8999, 1: 8356, 2: 1477})
Before Balancing:  Counter({2: 10621, 1: 7182, 4: 941, 0: 88})
0
