<a href="https://colab.research.google.com/github/Benita-hills/Poverty-prediction/blob/main/poverty_5k_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [22]:
def load_and_preprocess_data(file_path):
    # Your code for loading and processing the data
    df = pd.read_excel(file_path)
    # Continue with the rest of the preprocessing logic
    return df


In [23]:
file_path = "/content/5000 poverty rprediction.xlsx"
df = load_and_preprocess_data(file_path)



In [24]:
print(df.isnull().sum())

hhid                                   0
sector                                 0
zone                                   0
state                                  0
lga                                    0
Gender                                 0
Educational Level                      0
hhsize                                 0
Marital status                         0
Occupation type                        0
Year of birth                          0
Housing Types                          0
Roofing materia                        0
Cookstove type                         0
Access to electricity                  0
Main Source of Electricity          4197
Income source                          0
income type                            0
wt_final                               0
popw                                   0
Spending on food Items                 0
Spending on food items purchased       0
Spending on non food items             0
totcons_pc                             0
Consumption per 

In [26]:
# Replace all non-numeric values (like '-') with NaN (Not a Number)
df.replace('-', np.nan, inplace=True)

# Convert the column to numeric, forcing any non-numeric values to NaN
df = df.apply(pd.to_numeric, errors='coerce')

# After replacement, handle missing values using the median (or any other method)
df.fillna(df.median(), inplace=True)


  df.replace('-', np.nan, inplace=True)


In [27]:
def detect_outliers_iqr(df, feature):
    # Calculate the Q1 (25th percentile) and Q3 (75th percentile)
    Q1 = df[feature].quantile(0.25)
    Q3 = df[feature].quantile(0.75)

    # Calculate the Interquartile Range (IQR)
    IQR = Q3 - Q1

    # Calculate the lower and upper bounds for outliers
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Return a boolean series that marks outliers (True for outliers)
    return (df[feature] < lower_bound) | (df[feature] > upper_bound)

# Apply the IQR outlier detection for all numerical columns in the dataset
numerical_columns = df.select_dtypes(include=[np.number]).columns

outliers = pd.DataFrame()

for col in numerical_columns:
    outliers[col] = detect_outliers_iqr(df, col)

# Display rows where there are outliers
outlier_rows = df[outliers.any(axis=1)]
print("\nOutlier rows detected:")
print(outlier_rows)

# Count of outliers in each column
print("\nCount of outliers in each column:")
print(outliers.sum())


Outlier rows detected:
         hhid  sector  zone  state   lga  Gender  Educational Level  hhsize  \
0      111074       1     4      1   111       1                  8       3   
1      111077       1     4      1   111       1                  0       3   
2      111075       1     4      1   111       1                  6       7   
3      111079       1     4      1   111       1                  6       3   
4      111071       1     4      1   111       1                  2       8   
...       ...     ...   ...    ...   ...     ...                ...     ...   
4994  3318096       2     3     33  3318       1                  1       4   
4995  3410098       2     2     34  3410       1                  6       1   
4996  3410096       2     2     34  3410       1                  6       2   
4997  3401009       2     2     34  3401       1                  6       6   
4998  3401004       2     2     34  3401       1                  0      12   

      Marital status  Occup

In [29]:
# Remove rows with outliers
df_cleaned = df[~outliers.any(axis=1)]  # ~ is the negation operator (to remove outliers)

# Display cleaned data
print("\nCleaned Data without Outliers:")
print(df_cleaned.head())


Cleaned Data without Outliers:
      hhid  sector  zone  state  lga  Gender  Educational Level  hhsize  \
23  115107       2     4      1  115       1                  6       4   
34  107048       2     4      1  107       1                  2       3   
42  107045       2     4      1  107       1                 13       2   
46  115105       2     4      1  115       1                  6       4   
47  117150       2     4      1  117       1                  6       7   

    Marital status  Occupation type  ...  Main Source of Electricity  \
23               1                0  ...                         1.0   
34               1                0  ...                         1.0   
42               1                0  ...                         1.0   
46               1                0  ...                         1.0   
47               1                0  ...                         1.0   

    Income source  income type     wt_final          popw  \
23            101      

In [30]:
 # Define the poverty line threshold
poverty_line = 137430  # Define the poverty line

In [32]:
#Classify house hold as poor (1) and non Poor(0) based on total consumption per capital
df['poor'] = (df["Consumption per capita"] < poverty_line).astype(int)

In [33]:
# Select relevant features for predicting poverty
X=df[['hhsize', 'Income source', 'Educational Level', 'Spending on food Items', 'Spending on non food items']]
y=df['poor']

In [34]:
# Correct the feature list by removing the trailing comma
X = df[['hhsize', 'Income source', 'Educational Level', 'Spending on food Items', 'Spending on non food items']]


In [35]:
# Step 4: Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [36]:
# Step 5: Train the model (Random Forest)
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [37]:
# Step 6: Evaluate the model performance
y_pred = model.predict(X_test)

In [38]:
# Calculate performance metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)


In [39]:
print("Model Evaluation:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
print("Confusion Matrix:")
print(conf_matrix)

Model Evaluation:
Accuracy: 0.9560
Precision: 0.9379
Recall: 0.9199
F1-Score: 0.9288
Confusion Matrix:
[[669  19]
 [ 25 287]]


In [40]:
def get_user_input_and_predict():
    # Accept user input for relevant features
    print("\nEnter the details of the household to predict the poverty:")
    hhsize = int(input("Enter household size: "))  # Collect input for household size
    income_source = int(input("Enter income source (numeric value): "))  # Collect input for income source
    educational_level = int(input("Enter educational level (numeric value): "))  # Collect input for education level
    spending_food = float(input("Enter spending on food items: "))  # Collect input for food spending
    spending_non_food = float(input("Enter spending on non-food items: "))  # Collect input for non-food spending
    return pd.DataFrame([[hhsize, income_source, educational_level, spending_food, spending_non_food]],
                        columns=['hhsize', 'Income source', 'Educational Level', 'Spending on food Items', 'Spending on non food items'])


In [41]:
user_input = get_user_input_and_predict()


Enter the details of the household to predict the poverty:
Enter household size: 5
Enter income source (numeric value): 100000
Enter educational level (numeric value): 5
Enter spending on food items: 10000
Enter spending on non-food items: 30000


In [19]:
print(user_input)

   hhsize  Income source  Educational Level  Spending on food Items  \
0       8           2000                  4                  3000.0   

   Spending on non food items  
0                      4000.0  


In [42]:
# Predict using the trained model
def predict_poverty(user_input):
    prediction = model.predict(user_input)
    prediction = prediction[0]
    print(prediction)
    if prediction == 1:
        print("The household is predicted to be BELOW the poverty line (Poor).")
    else:
        print("The household is predicted to be ABOVE the poverty line (Not Poor).")


In [43]:
predict_poverty(user_input)

1
The household is predicted to be BELOW the poverty line (Poor).
