# Health detector model

In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder , OneHotEncoder



In [2]:
df_train = pd.read_csv('Train_Data.csv')
df_test = pd.read_csv('Test_Data.csv')

In [3]:
df_train.shape

(25920, 18)

In [4]:
df_test.shape

(6480, 17)

In [5]:
df_train=df_train.dropna(how='any')

## Replacing missing values with mode

In [6]:
import pandas as pd
from sklearn.impute import SimpleImputer
# Create an instance of SimpleImputer with the 'mean' strategy
imputer = SimpleImputer(strategy='most_frequent')

# Select the columns with missing values
columns_with_missing = ['Food preference','Follow Diet','Physical activity','Regular sleeping hours','Alcohol consumption','Social interaction','Taking supplements','Mental health management','Illness count last year']  # Replace with actual column names

# Fill missing values with the mean of each column
df_test[columns_with_missing] = imputer.fit_transform(df_test[columns_with_missing])
df_test.head()


Unnamed: 0,ID1,Specific ailments,ID2,Food preference,Age,BMI,Smoker?,Living in?,Any heriditary condition?,Follow Diet,Physical activity,Regular sleeping hours,Alcohol consumption,Social interaction,Taking supplements,Mental health management,Illness count last year
0,28534,2,3306,DX6,38,18.879331,YES,URBAN,Stable,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
1,7970,5,5573,DX1,46,21.231991,NO,URBAN,Stable,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,22039,37,9305,DX4,11,17.867876,NO,RURAL,Stable,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,12332,44,8274,DX5,10,26.886096,NO,RURAL,Stable,1.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0
4,4116,37,4558,DX3 DX4,3,23.362746,NO,RURAL,Stable,0.0,1.0,0.0,1.0,0.0,0.0,0.0,2.0


In [7]:
df_train.head()

Unnamed: 0,ID1,Specific ailments,ID2,Food preference,Age,BMI,Smoker?,Living in?,Any heriditary condition?,Follow Diet,Physical activity,Regular sleeping hours,Alcohol consumption,Social interaction,Taking supplements,Mental health management,Illness count last year,Healthy
0,2408,44,2668,DX6,49,20.50047,NO,RURAL,Stable,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1
1,25063,39,10363,DX3 DX4,20,26.07658,NO,URBAN,Stable,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1
2,26798,29,132,DX6,1,21.420866,NO,URBAN,Stable,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
3,31907,27,10499,DX1,30,25.203247,NO,RURAL,Stable,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1
4,26412,9,7963,DX6,40,19.355846,YES,RURAL,Stable,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,1


In [8]:
df_test.head()

Unnamed: 0,ID1,Specific ailments,ID2,Food preference,Age,BMI,Smoker?,Living in?,Any heriditary condition?,Follow Diet,Physical activity,Regular sleeping hours,Alcohol consumption,Social interaction,Taking supplements,Mental health management,Illness count last year
0,28534,2,3306,DX6,38,18.879331,YES,URBAN,Stable,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
1,7970,5,5573,DX1,46,21.231991,NO,URBAN,Stable,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,22039,37,9305,DX4,11,17.867876,NO,RURAL,Stable,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,12332,44,8274,DX5,10,26.886096,NO,RURAL,Stable,1.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0
4,4116,37,4558,DX3 DX4,3,23.362746,NO,RURAL,Stable,0.0,1.0,0.0,1.0,0.0,0.0,0.0,2.0


In [9]:
df_test = pd.get_dummies(df_test, columns=['Food preference'])


In [10]:
df_test = pd.get_dummies(df_test, columns=['Smoker?','Living in?','Any heriditary condition?'])


In [11]:
df_test.head()

Unnamed: 0,ID1,Specific ailments,ID2,Age,BMI,Follow Diet,Physical activity,Regular sleeping hours,Alcohol consumption,Social interaction,...,Food preference_DX4,Food preference_DX4 DX5,Food preference_DX5,Food preference_DX6,Smoker?_Cannot say,Smoker?_NO,Smoker?_YES,Living in?_RURAL,Living in?_URBAN,Any heriditary condition?_Stable
0,28534,2,3306,38,18.879331,0.0,0.0,0.0,0.0,0.0,...,0,0,0,1,0,0,1,0,1,1
1,7970,5,5573,46,21.231991,1.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,1,0,0,1,1
2,22039,37,9305,11,17.867876,1.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0,1,0,1,0,1
3,12332,44,8274,10,26.886096,1.0,0.0,0.0,1.0,0.0,...,0,0,1,0,0,1,0,1,0,1
4,4116,37,4558,3,23.362746,0.0,1.0,0.0,1.0,0.0,...,0,0,0,0,0,1,0,1,0,1


In [12]:
df_train = pd.get_dummies(df_train, columns=['Food preference','Smoker?','Living in?','Any heriditary condition?'])


In [13]:
df_train.head()

Unnamed: 0,ID1,Specific ailments,ID2,Age,BMI,Follow Diet,Physical activity,Regular sleeping hours,Alcohol consumption,Social interaction,...,Food preference_DX3 DX5,Food preference_DX4,Food preference_DX4 DX5,Food preference_DX5,Food preference_DX6,Smoker?_NO,Smoker?_YES,Living in?_RURAL,Living in?_URBAN,Any heriditary condition?_Stable
0,2408,44,2668,49,20.50047,1.0,0.0,0.0,0.0,0.0,...,0,0,0,0,1,1,0,1,0,1
1,25063,39,10363,20,26.07658,0.0,0.0,0.0,0.0,1.0,...,0,0,0,0,0,1,0,0,1,1
2,26798,29,132,1,21.420866,1.0,0.0,0.0,0.0,0.0,...,0,0,0,0,1,1,0,0,1,1
3,31907,27,10499,30,25.203247,1.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,1,0,1,0,1
4,26412,9,7963,40,19.355846,1.0,0.0,0.0,0.0,1.0,...,0,0,0,0,1,0,1,1,0,1


In [14]:
df_train_filled= df_train
df_test_filled= df_test


## Processing the data

In [15]:
from sklearn.preprocessing import MinMaxScaler
X= df_train_filled.drop(columns=['Healthy'])
y= df_train_filled['Healthy']
scaler= MinMaxScaler()
X_scaled= scaler.fit_transform(X)
X_scaled_df = pd.DataFrame(X_scaled , columns =X.columns)
X_scaled_df.head()

Unnamed: 0,ID1,Specific ailments,ID2,Age,BMI,Follow Diet,Physical activity,Regular sleeping hours,Alcohol consumption,Social interaction,...,Food preference_DX3 DX5,Food preference_DX4,Food preference_DX4 DX5,Food preference_DX5,Food preference_DX6,Smoker?_NO,Smoker?_YES,Living in?_RURAL,Living in?_URBAN,Any heriditary condition?_Stable
0,0.072981,0.843137,0.213377,0.742424,0.269257,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
1,0.759892,0.745098,0.829026,0.30303,0.698213,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,0.812498,0.54902,0.010481,0.015152,0.340061,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
3,0.967405,0.509804,0.839907,0.454545,0.63103,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
4,0.800794,0.156863,0.637011,0.606061,0.181204,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0


In [16]:
X_scaled_df.shape

(24937, 50)

In [17]:
from sklearn.preprocessing import MinMaxScaler
X1= df_test_filled
scaler= MinMaxScaler()
X1_scaled= scaler.fit_transform(X1)
X1_scaled_df = pd.DataFrame(X1_scaled , columns =X1.columns)
X1_scaled_df.head()

Unnamed: 0,ID1,Specific ailments,ID2,Age,BMI,Follow Diet,Physical activity,Regular sleeping hours,Alcohol consumption,Social interaction,...,Food preference_DX4,Food preference_DX4 DX5,Food preference_DX5,Food preference_DX6,Smoker?_Cannot say,Smoker?_NO,Smoker?_YES,Living in?_RURAL,Living in?_URBAN,Any heriditary condition?_Stable
0,0.86431,0.038462,0.264066,0.308943,0.615353,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
1,0.241328,0.096154,0.445253,0.373984,0.696731,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,0.667545,0.711538,0.743526,0.089431,0.580367,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
3,0.373474,0.846154,0.661125,0.081301,0.892305,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
4,0.124572,0.711538,0.36413,0.02439,0.770433,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0


In [18]:
X1_scaled_df.shape

(6480, 51)

In [19]:
X1_scaled_df=X1_scaled_df.drop(columns=['Smoker?_Cannot say'])

In [20]:
X_train = X_scaled_df
y_train = y
X_test = X1_scaled_df

In [21]:
y.head()

0    1
1    1
2    0
3    1
4    1
Name: Healthy, dtype: int64

## Training and predicting 

In [22]:
from sklearn.linear_model import LogisticRegression

In [23]:
lr = LogisticRegression()
lr.fit(X_train , y_train)
y_pred = lr.predict_proba(X_test)[:,1]
y_preed=(y_pred >= 0.2033).astype(int)
eval_df = df_test_filled.copy()
eval_df['pred'] = y_preed
# eval_df.to_csv('test_with_prediction_DTCEf10.csv' , index=False)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [24]:
eval_df.tail(50)

Unnamed: 0,ID1,Specific ailments,ID2,Age,BMI,Follow Diet,Physical activity,Regular sleeping hours,Alcohol consumption,Social interaction,...,Food preference_DX4 DX5,Food preference_DX5,Food preference_DX6,Smoker?_Cannot say,Smoker?_NO,Smoker?_YES,Living in?_RURAL,Living in?_URBAN,Any heriditary condition?_Stable,pred
6430,31915,4,1425,10,24.431574,1.0,0.0,0.0,0.0,0.0,...,1,0,0,0,1,0,1,0,1,1
6431,21004,42,5667,51,29.562746,1.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,1,1,0,1,0
6432,10370,19,442,3,22.267155,1.0,0.0,0.0,0.0,0.0,...,0,0,0,0,1,0,1,0,1,1
6433,27572,32,1979,33,21.559845,1.0,0.0,0.0,0.0,0.0,...,0,0,1,0,0,1,1,0,1,0
6434,25615,45,1225,39,23.075123,1.0,0.0,0.0,0.0,0.0,...,0,0,1,0,1,0,1,0,1,1
6435,6945,40,2710,55,22.21237,0.0,1.0,0.0,0.0,0.0,...,0,0,0,0,0,1,1,0,1,1
6436,25219,2,10938,60,21.35681,1.0,0.0,0.0,0.0,0.0,...,0,0,1,0,1,0,1,0,1,1
6437,8778,4,4047,36,21.598091,1.0,1.0,0.0,0.0,0.0,...,0,0,0,0,0,1,1,0,1,0
6438,2739,14,6253,21,18.220062,1.0,0.0,0.0,0.0,0.0,...,0,0,1,0,1,0,1,0,1,1
6439,25299,50,2580,58,27.416915,1.0,0.0,0.0,0.0,0.0,...,0,0,0,0,1,0,1,0,1,1


In [27]:
eval_df.to_csv('test_point5.csv', index=False)


In [28]:
eval_df.shape

(6480, 52)