<a href="https://colab.research.google.com/github/444vj/child-malnutrition-01/blob/main/random_forest2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Read the data
data = pd.read_csv('/content/merged_dataset.csv')

# Separate predictors and target
X = data.drop(columns=['Child Underweight'])  # Predictors
y = data['Child Underweight']  # Target

# Split data into train and test sets
xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=42)

# Define numerical and categorical columns
numerical_cols = ['Women Underweight', 'GDP per capita', 'Govt. Health Expenditure', 'MMR', "Mother's age", 'Female Education']
categorical_cols = ['Country Name']

# Preprocessing for numerical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean'))
])

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Define the model
model = RandomForestRegressor(n_estimators=300)

# Bundle preprocessing and modeling code in a pipeline
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('model', model)
                     ])

# Preprocessing of training data, fit model
clf.fit(xtrain, ytrain)


In [11]:
from sklearn.metrics import accuracy_score, classification_report

# Discretize 'Child Underweight' into classes
threshold = 10  # Define a threshold for classifying underweight (for example, <= 10 is underweight, > 10 is not underweight)
ytrain_class = (ytrain <= threshold).astype(int)
ytest_class = (ytest <= threshold).astype(int)

# Fit the model again with the new target variable
clf.fit(xtrain, ytrain_class)

# Predictions on the test set
ypred_class = clf.predict(xtest)

# Accuracy score
accuracy = accuracy_score(ytest_class, ypred_class)
print("Accuracy Score:", accuracy)

# Classification report
print("Classification Report:")
print(classification_report(ytest_class, ypred_class))


ValueError: Classification metrics can't handle a mix of binary and continuous targets

In [12]:
print(data['Child Underweight'].unique())


[ 2.05 19.1  28.5  29.1  12.3  22.1  29.3   3.   12.7   2.3   3.3   6.97
  5.8   2.13 12.   11.6  16.8  36.1   1.54  2.25 10.6  27.6  11.4  21.7
 23.    3.8   2.8  18.1  26.9  25.3  25.5   8.86  7.38  1.4  16.2   5.4
 13.    7.2  40.8   1.83  2.38 10.1   2.6   3.05 30.6   3.5  20.   29.
 12.9  29.4  13.3  24.7   2.15 12.1  17.2  36.8   1.9   2.53 23.2  10.7
 13.6  28.3  11.8  24.6   1.81  2.2  27.2  36.5   1.44 27.    1.3   9.2
  2.7   8.04 16.1   1.5   6.23 13.5   7.7  41.3   2.32  3.15  7.1  21.
 28.9  13.4  29.6  16.4  23.1  13.8   6.9   4.9  25.8   2.16 10.9  12.6
 17.6  37.5   1.92  2.26 24.2  10.8  14.3   1.94 19.5   1.48 28.4   2.
  9.66  8.7  12.5   1.6   6.1  14.    8.2  41.7  11.7   3.7   3.95 33.8
  6.8  21.9  22.2  29.8   2.4  16.7  14.4   6.87 26.8   2.18 13.1  18.
 38.3   2.46  2.62 11.5  26.2  24.5  14.1  27.9  37.2  30.9   1.51 29.9
  9.36 14.5   5.98 42.2  12.4  39.6   8.95  4.25 35.    6.5  23.5  28.8
 22.9  23.4   6.83 18.4   9.16  6.91 39.    2.98 26.4  30.4  12.8  

In [13]:
from sklearn.metrics import accuracy_score, classification_report

# Discretize 'Child Underweight' into classes using the chosen threshold
threshold = 10
ytrain_class = (ytrain <= threshold).astype(int)
ytest_class = (ytest <= threshold).astype(int)

# Fit the model again with the new binary target variable
clf.fit(xtrain, ytrain_class)

# Predictions on the test set
ypred_class = clf.predict(xtest)

# Accuracy score
accuracy = accuracy_score(ytest_class, ypred_class)
print("Accuracy Score:", accuracy)

# Classification report
print("Classification Report:")
print(classification_report(ytest_class, ypred_class))


ValueError: Classification metrics can't handle a mix of binary and continuous targets

In [14]:
print("Data type of ytrain:", ytrain.dtype)
print("Data type of ytrain_class:", ytrain_class.dtype)


Data type of ytrain: float64
Data type of ytrain_class: int64


In [16]:
# Fit the model again with the new binary target variable
clf.fit(xtrain, ytrain_class)

# Predictions on the test set
ypred_class = clf.predict(xtest)
print(ypred_class)


[0.         0.         0.86       0.         0.         0.03666667
 1.         0.         0.         1.         1.         0.88333333
 0.02333333 0.88       0.97666667 1.         0.00333333 0.
 0.         0.         1.         1.         0.01333333 1.
 0.         0.99333333 0.99333333 0.61666667 0.04333333 0.
 0.         0.         0.57       0.         1.         0.05333333
 1.         0.         1.         0.         1.         0.
 0.         0.         0.         0.03333333 0.01666667 0.
 0.         0.         1.         1.         0.         0.95333333
 1.         1.         0.         0.         0.99       1.
 0.98333333 0.         0.04      ]


In [17]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier

# Define the RandomForestClassifier
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', RandomForestClassifier(n_estimators=300))])

# Fit the model with the binary target variable
clf.fit(xtrain, ytrain_class)

# Predictions on the test set
ypred_class = clf.predict(xtest)

# Accuracy score
accuracy = accuracy_score(ytest_class, ypred_class)
print("Accuracy Score:", accuracy)

# Classification report
print("Classification Report:")
print(classification_report(ytest_class, ypred_class))


Accuracy Score: 0.9682539682539683
Classification Report:
              precision    recall  f1-score   support

           0       0.95      1.00      0.97        36
           1       1.00      0.93      0.96        27

    accuracy                           0.97        63
   macro avg       0.97      0.96      0.97        63
weighted avg       0.97      0.97      0.97        63

