In [24]:
import pandas as pd 
import numpy as np 

In [25]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_predict

In [26]:
# Load both competition splits from the working directory: train first, then test.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [27]:
# Reduce both frames to the same set of feature columns.
#   - 'NObeyesdad' is dropped from train only (presumably the target label,
#     which test does not carry -- TODO confirm).
#   - 'id' is a row identifier, removed from both frames.
#   - 'CALC' is removed from both frames; the reason is not recorded in this
#     notebook -- TODO document why this feature is excluded.
# One non-in-place drop per frame replaces five separate `inplace=True` calls;
# the resulting columns and their order are unchanged.
train = train.drop(columns=['NObeyesdad', 'id', 'CALC'])
test = test.drop(columns=['id', 'CALC'])

In [28]:
# Show which columns remain in train after the drops in the previous cell.
train.columns

Index(['Gender', 'Age', 'Height', 'Weight', 'family_history_with_overweight',
       'FAVC', 'FCVC', 'NCP', 'CAEC', 'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE',
       'MTRANS'],
      dtype='object')

In [29]:
# Show which columns remain in test; these should match train's columns exactly.
test.columns

Index(['Gender', 'Age', 'Height', 'Weight', 'family_history_with_overweight',
       'FAVC', 'FCVC', 'NCP', 'CAEC', 'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE',
       'MTRANS'],
      dtype='object')

In [30]:
# Object-dtype (string) subset of train. Note this is a DataFrame, not a list of names.
categorical_columns = train.select_dtypes(include=['object'])

# Print the first two rows of each string column as a one-column frame.
for cat in categorical_columns.columns:
    print(train[[cat]].head(2))

   Gender
0    Male
1  Female
  family_history_with_overweight
0                            yes
1                            yes
  FAVC
0  yes
1  yes
         CAEC
0   Sometimes
1  Frequently
  SMOKE
0    no
1    no
  SCC
0  no
1  no
                  MTRANS
0  Public_Transportation
1             Automobile


In [31]:
# Preview the first rows of the object-dtype subset of train
# (categorical_columns is a DataFrame produced by select_dtypes, not a list of names).
categorical_columns.head()

Unnamed: 0,Gender,family_history_with_overweight,FAVC,CAEC,SMOKE,SCC,MTRANS
0,Male,yes,yes,Sometimes,no,no,Public_Transportation
1,Female,yes,yes,Frequently,no,no,Automobile
2,Female,yes,yes,Sometimes,no,no,Public_Transportation
3,Female,yes,yes,Sometimes,no,no,Public_Transportation
4,Male,yes,yes,Sometimes,no,no,Public_Transportation


In [32]:
# Recompute the object-dtype subset of train and list its column names, one per line.
categorical_columns = train.select_dtypes(include=['object'])
for cat in categorical_columns.columns:
    print(cat)

Gender
family_history_with_overweight
FAVC
CAEC
SMOKE
SCC
MTRANS


In [33]:
# Distinct values of the MTRANS column in train, in order of first appearance.
t = pd.unique(train['MTRANS'])
print(t)

['Public_Transportation' 'Automobile' 'Walking' 'Motorbike' 'Bike']


In [34]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode every string-valued feature so the classifier can consume it.
# Each encoder is fitted on train only and then applied to test, so both
# frames share the same category -> code mapping.
columns_to_encode = [
    'Gender',
    'family_history_with_overweight',
    'FAVC',
    'CAEC',
    'SMOKE',
    'SCC',
    'MTRANS',
]

# One fitted encoder per column is kept here, so the integer codes can later be
# mapped back to the original labels with `encoders[col].inverse_transform(...)`.
encoders = {}
for column in columns_to_encode:
    # A fresh encoder per column: re-fitting a single shared instance would
    # discard the mapping learned for the previous column.
    label_encoder = LabelEncoder()
    train[column] = label_encoder.fit_transform(train[column])
    # transform() raises ValueError if test contains a category that was not
    # present in train for this column.
    test[column] = label_encoder.transform(test[column])
    encoders[column] = label_encoder

In [41]:
# Summarise test after encoding: per-column dtype and non-null count.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13840 entries, 0 to 13839
Data columns (total 15 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Gender                          13840 non-null  int64  
 1   Age                             13840 non-null  float64
 2   Height                          13840 non-null  float64
 3   Weight                          13840 non-null  float64
 4   family_history_with_overweight  13840 non-null  int64  
 5   FAVC                            13840 non-null  int64  
 6   FCVC                            13840 non-null  float64
 7   NCP                             13840 non-null  float64
 8   CAEC                            13840 non-null  int64  
 9   SMOKE                           13840 non-null  int64  
 10  CH2O                            13840 non-null  float64
 11  SCC                             13840 non-null  int64  
 12  FAF                             

In [42]:
# Summarise train after encoding: per-column dtype and non-null count.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20758 entries, 0 to 20757
Data columns (total 15 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Gender                          20758 non-null  int64  
 1   Age                             20758 non-null  float64
 2   Height                          20758 non-null  float64
 3   Weight                          20758 non-null  float64
 4   family_history_with_overweight  20758 non-null  int64  
 5   FAVC                            20758 non-null  int64  
 6   FCVC                            20758 non-null  float64
 7   NCP                             20758 non-null  float64
 8   CAEC                            20758 non-null  int64  
 9   SMOKE                           20758 non-null  int64  
 10  CH2O                            20758 non-null  float64
 11  SCC                             20758 non-null  int64  
 12  FAF                             

In [43]:
# train = train.fillna(-1).drop(["id", "NObeyesdad"], axis=1)
# test = test.fillna(-1).drop(["id"], axis=1)

In [44]:
# Stack train on top of test into a single feature matrix with a fresh 0..n-1 index.
X = pd.concat([train, test], axis=0, ignore_index=True)

# Origin label for every stacked row: 0 for rows that came from train, 1 for rows
# that came from test (same order as the concatenation above).
y = [origin for origin, frame in enumerate((train, test)) for _ in range(len(frame))]

In [45]:
# print(type(train))
# print(type(test))


In [46]:
# Adversarial validation: train a classifier to tell train rows (y == 0) from
# test rows (y == 1). An out-of-fold ROC AUC near 0.5 means the classifier
# cannot separate them, i.e. the two sets look alike on these features.
#
# random_state is fixed so the forest, and therefore the reported AUC, is
# reproducible across runs. With an integer `cv` and a classifier,
# scikit-learn uses unshuffled stratified folds, which are already deterministic.
model = RandomForestClassifier(random_state=42)

# Out-of-fold class probabilities for every row, so each row is scored by a
# model that never saw it during fitting.
cv_preds = cross_val_predict(model, X, y, cv=5, n_jobs=-1, method='predict_proba')

# Column 1 is the predicted probability of class 1 (row originates from test).
print(roc_auc_score(y_true=y, y_score=cv_preds[:, 1]))

0.49765679866025603


## The training and test data appear to come from the same distribution: the classifier reaches an AUC of about 0.50 (0.498), which is no better than random guessing, so the two data sets are indistinguishable on these features.