In [1]:
import pandas as pd
import glob
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix


In [2]:
folder_path = "*.csv"
# Read and merge all CSV files
files = glob.glob(folder_path)
df = pd.concat([pd.read_csv(f) for f in files], ignore_index=True)

print("Shape after merge:", df.shape)

Shape after merge: (77646, 20)


In [3]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77646 entries, 0 to 77645
Data columns (total 20 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Current Region       77631 non-null  object
 1   CurrentDistrict      77646 non-null  object
 2   Previous Region      77644 non-null  object
 3   PreviousDistrict     77644 non-null  object
 4   Arrival              77646 non-null  object
 5   ZoneName             77646 non-null  object
 6   OrganisationAcronym  77646 non-null  object
 7   0-4M                 77646 non-null  int64 
 8   0-4F                 77646 non-null  int64 
 9   5-11M                77646 non-null  int64 
 10  5-11F                77646 non-null  int64 
 11  12-17M               77646 non-null  int64 
 12  12-17F               77646 non-null  int64 
 13  18_59M               77646 non-null  int64 
 14  18_59F               77646 non-null  int64 
 15  60+M                 77646 non-null  int64 
 16  60+F

In [4]:
# Convert Arrival to datetime and extract useful features
df["Arrival"] = pd.to_datetime(df["Arrival"], errors="coerce")
df["Arrival_Year"] = df["Arrival"].dt.year
df["Arrival_Month"] = df["Arrival"].dt.month


In [5]:
# df = df.drop(columns=["OrganisationAcronym", "Category"])
df = df.drop(columns=["Arrival"])



In [6]:
df = df.rename(columns={
    "0-4M": "Age0_4_Male",
    "0-4F": "Age0_4_Female",
    "5-11M": "Age5_11_Male",
    "5-11F": "Age5_11_Female",
    "12-17M": "Age12_17_Male",
    "12-17F": "Age12_17_Female",
    "18_59M": "Age18_59_Male",
    "18_59F": "Age18_59_Female",
    "60+M": "Age60plus_Male",
    "60+F": "Age60plus_Female"
})


In [7]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77646 entries, 0 to 77645
Data columns (total 21 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Current Region       77631 non-null  object 
 1   CurrentDistrict      77646 non-null  object 
 2   Previous Region      77644 non-null  object 
 3   PreviousDistrict     77644 non-null  object 
 4   ZoneName             77646 non-null  object 
 5   OrganisationAcronym  77646 non-null  object 
 6   Age0_4_Male          77646 non-null  int64  
 7   Age0_4_Female        77646 non-null  int64  
 8   Age5_11_Male         77646 non-null  int64  
 9   Age5_11_Female       77646 non-null  int64  
 10  Age12_17_Male        77646 non-null  int64  
 11  Age12_17_Female      77646 non-null  int64  
 12  Age18_59_Male        77646 non-null  int64  
 13  Age18_59_Female      77646 non-null  int64  
 14  Age60plus_Male       77646 non-null  int64  
 15  Age60plus_Female     77646 non-null 

In [8]:
df["Current Region"] = df["Current Region"].fillna(df["Current Region"].mode()[0])
df["Previous Region"] = df["Previous Region"].fillna(df["Previous Region"].mode()[0])
df["PreviousDistrict"] = df["PreviousDistrict"].fillna(df["Previous Region"].mode()[0])
df["Arrival_Year"] = df["Arrival_Year"].fillna(df["Arrival_Year"].mode()[0])
df["Arrival_Month"] = df["Arrival_Month"].fillna(df["Arrival_Month"].mode()[0])

In [9]:
df = df.drop_duplicates()

In [10]:
print(df["Individuals"].head())

0    5
1    6
2    6
3    4
4    6
Name: Individuals, dtype: int64


In [11]:
# Training data (Need is known)
train_df = df[df["Need"].notnull()].copy()
# Prediction data (Need is missing, to predict later)
predict_df = df[df["Need"].isnull()].copy()


In [12]:
print(train_df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 60776 entries, 5 to 77645
Data columns (total 21 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Current Region       60776 non-null  object 
 1   CurrentDistrict      60776 non-null  object 
 2   Previous Region      60776 non-null  object 
 3   PreviousDistrict     60776 non-null  object 
 4   ZoneName             60776 non-null  object 
 5   OrganisationAcronym  60776 non-null  object 
 6   Age0_4_Male          60776 non-null  int64  
 7   Age0_4_Female        60776 non-null  int64  
 8   Age5_11_Male         60776 non-null  int64  
 9   Age5_11_Female       60776 non-null  int64  
 10  Age12_17_Male        60776 non-null  int64  
 11  Age12_17_Female      60776 non-null  int64  
 12  Age18_59_Male        60776 non-null  int64  
 13  Age18_59_Female      60776 non-null  int64  
 14  Age60plus_Male       60776 non-null  int64  
 15  Age60plus_Female     60776 non-null  int6

In [13]:
print(train_df.shape)
print(predict_df.shape)

(60776, 21)
(2246, 21)


In [14]:
# Features and labels
X = train_df.drop(columns=["Need"])   
y = train_df["Need"]                 


In [15]:
# One-hot encode categorical variables
X_encoded = pd.get_dummies(
    X,
    columns=["Current Region", "CurrentDistrict", "Previous Region", "PreviousDistrict", "ZoneName"],
    drop_first=True  
).astype(int)


ValueError: invalid literal for int() with base 10: 'SEDHURO'

In [None]:
predict_encoded = pd.get_dummies(
    predict_df.drop(columns=["Need"]),
    columns=["Current Region", "CurrentDistrict", "Previous Region", "PreviousDistrict", "ZoneName"],
    drop_first=True
).astype(int)
# Align columns (this will make sure training and prediction sets have same structure)
predict_encoded = predict_encoded.reindex(columns=X_encoded.columns, fill_value=0)


In [18]:

X_train, X_val, y_train, y_val = train_test_split(
    X_encoded, y, test_size=0.2, random_state=42, stratify=y
)


NameError: name 'X_encoded' is not defined

In [None]:
# Logistic Regression
lr = LogisticRegression(max_iter=5000, random_state=42,class_weight="balanced")
lr.fit(X_train, y_train)

# Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42,class_weight="balanced")
rf.fit(X_train, y_train)
# LogisticRegression(class_weight="balanced")
# RandomForestClassifier(class_weight="balanced")


In [None]:
def evaluate(model, X_val, y_val, name):
    y_pred = model.predict(X_val)
    print(f"\n{name} Results:")
    print(f"Accuracy: {accuracy_score(y_val, y_pred):.3f}")
    print(f"Precision: {precision_score(y_val, y_pred, average='weighted'):.3f}")
    print(f"Recall: {recall_score(y_val, y_pred, average='weighted'):.3f}")
    print(f"F1 Score: {f1_score(y_val, y_pred, average='weighted'):.3f}")
    # print("Confusion Matrix:")
    # print(confusion_matrix(y_val, y_pred))

In [None]:
evaluate(lr, X_val, y_val, "Logistic Regression")
evaluate(rf, X_val, y_val, "Random Forest")

In [None]:
print(train_df["Need"].value_counts(dropna=False))


In [None]:
print(train_df.head())

In [None]:
print(train_df.info())

In [None]:
print(X.info())

In [None]:
print(X_encoded.info())

In [None]:
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
import numpy as np

# Define parameter grid for Random Forest
param_dist = {
    'n_estimators': [100, 200, 300, 500],
    'max_depth': [None, 10, 20, 30, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None],
    'bootstrap': [True, False]
}

# StratifiedKFold is good for classification, especially with imbalanced data
cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Create the randomized search object
rf_random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=30,  # Number of random combinations to try
    cv=cv_strategy,
    verbose=2,
    n_jobs=-1,
    scoring='f1_macro',  # Good metric for imbalanced multiclass
    random_state=42
)

# Fit the search to the data
rf_random_search.fit(X_train, y_train)

print("Best Parameters:", rf_random_search.best_params_)
print("Best CV Score (f1_macro):", rf_random_search.best_score_)


In [None]:
from sklearn.metrics import classification_report, accuracy_score

# Retrieve the best model
best_rf = rf_random_search.best_estimator_

# Evaluate on test set
y_pred = best_rf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))


In [17]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))

NameError: name 'y_test' is not defined