# CE3 Project: Predicting Unemployment in Romania (2010-2013)
### Using Machine Learning Models: SVM and Neural Networks
**Objective**: Model the probability that a respondent is unemployed (`ILOSTAT == 2`) based on socio-demographic and occupational variables.

**Data source**: EU Labour Force Survey (Romania, 2010–2013)

In [None]:
import pandas as pd

# Read one yearly LFS extract per survey year (2010-2013), tag every row with
# the year it came from, and stack the yearly frames into a single table.
dfs = []
for year in range(2010, 2014):
    csv_path = f'../data/raw/RO_LFS_{year}_Y.csv'
    # low_memory=False makes pandas read each file in one pass instead of in
    # chunks, which avoids mixed-dtype guesses within a column.
    df = pd.read_csv(csv_path, low_memory=False).assign(YEAR=year)
    dfs.append(df)

# ignore_index=True gives the combined frame a fresh 0..n-1 row index.
data = pd.concat(dfs, ignore_index=True)
data.head()

In [None]:
# Predictor columns taken from the LFS files. The YEAR column added at load
# time is appended below as an additional predictor.
selected_vars = [
    'AGE',
    'SEX',
    'DEGURBA',
    'COUNTRPR',
    'YEARESID',
    'HATLEV1D',
    'NACE1D',
    'ISCO1D',
    'NUMJOB',
]

# ILOSTAT must survive the column selection because TARGET is derived from it;
# leaving it out of the subset makes the TARGET line fail with a KeyError.
keep_cols = selected_vars + ['YEAR', 'ILOSTAT']

# dropna() removes rows with a missing ILOSTAT as well as rows missing any
# predictor. .copy() yields an independent frame, so adding TARGET below is a
# plain column assignment rather than a write into a slice of the raw data.
# NOTE(review): if job-related fields (NACE1D, ISCO1D, NUMJOB) are blank for
# respondents without a job, this dropna() may discard most or all unemployed
# rows -- check data['TARGET'].value_counts() before modelling.
data = data[keep_cols].dropna().copy()

# Binary target: 1 when ILOSTAT == 2 (unemployed), 0 for every other status.
data['TARGET'] = (data['ILOSTAT'] == 2).astype(int)
data.head()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Columns treated as categorical codes rather than quantities.
cat_vars = ['SEX', 'HATLEV1D', 'DEGURBA', 'NACE1D', 'ISCO1D']

# Replace each categorical column with integer labels. Values are converted to
# strings first, so the integer assigned to a category follows the sorted order
# of its string form and carries no numeric meaning.
for var in cat_vars:
    as_text = data[var].astype(str)
    le = LabelEncoder()
    data[var] = le.fit_transform(as_text)

data.head()

In [None]:
from sklearn.feature_selection import RFE
from sklearn.svm import SVC

# Feature matrix: everything except the raw status code and the target that
# was derived from it.
X = data.drop(['ILOSTAT', 'TARGET'], axis=1)
y = data['TARGET']

# Recursive feature elimination driven by a linear-kernel SVM.
# NOTE(review): the selection cell keeps 9 predictors plus YEAR, so X appears
# to have 10 columns -- fewer than the 12 requested here. Confirm whether RFE
# eliminates anything at all with this setting.
# NOTE(review): the selector is fitted on all rows, before the train/test
# split, so the held-out rows also influence which features are chosen.
svm = SVC(kernel='linear')
rfe = RFE(estimator=svm, n_features_to_select=12)
X_rfe = rfe.fit_transform(X, y)

# Names of the columns the selector kept (rfe.support_ is a boolean mask).
kept_mask = rfe.support_
selected_features = X.columns[kept_mask].tolist()
selected_features

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Hold out 30% of the rows for testing. The split is stratified on the target
# so both parts keep the same class balance, and seeded for reproducibility.
model_inputs = data[selected_features]
X_train, X_test, y_train, y_test = train_test_split(
    model_inputs,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Learn the min/max scaling from the training rows only, then apply the same
# transformation to the test rows.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# RBF-kernel SVM; probability=True is required for predict_proba below.
svm = SVC(kernel='rbf', C=10, gamma=0.01, probability=True)
svm.fit(X_train_scaled, y_train)

# Hard class predictions for the held-out rows.
svm_preds = svm.predict(X_test_scaled)

# predict_proba returns one column per class in svm.classes_ order; column 1
# is the second class, i.e. label 1 when both labels occur in y_train.
svm_class_probs = svm.predict_proba(X_test_scaled)
svm_probs = svm_class_probs[:, 1]

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam

# Feed-forward binary classifier: three ReLU hidden layers (128 -> 64 -> 32
# units), each followed by 30% dropout, and a single sigmoid output unit.
n_inputs = X_train_scaled.shape[1]
nn_layers = [
    Dense(128, activation='relu', input_shape=(n_inputs,)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid'),
]
model = Sequential(nn_layers)

model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(learning_rate=0.001),
    metrics=['accuracy'],
)

# Stop once the monitored quantity has not improved for 20 epochs and roll the
# weights back to the best epoch. No `monitor` is passed, so Keras' default
# monitored quantity is used.
early_stopping = EarlyStopping(patience=20, restore_best_weights=True)

# The last 20% of the training rows are set aside as validation data.
model.fit(
    X_train_scaled,
    y_train,
    validation_split=0.2,
    epochs=200,
    batch_size=32,
    callbacks=[early_stopping],
)

# The sigmoid output has one value per row; flatten it to a 1-D array.
nn_probs = model.predict(X_test_scaled).flatten()

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
import matplotlib.pyplot as plt

def evaluate_model(y_true, y_pred, y_probs, name="Model"):
    """Print classification metrics for one model and plot its ROC curve.

    Args:
        y_true: Ground-truth binary labels.
        y_pred: Predicted class labels; used for precision, recall, F1 and
            the confusion matrix.
        y_probs: Predicted scores for the positive class; used for ROC AUC
            and the ROC curve.
        name: Label shown in the printed header and in the plot legend.

    The curve is drawn with ``plt.plot`` and ``plt.show()`` is not called
    here, so curves from successive calls land on the same figure.
    """
    print(f"\n{name} Metrics")

    # Label-based metrics, printed in a fixed order.
    label_metrics = (
        ("Precision:", precision_score),
        ("Recall:", recall_score),
        ("F1-score:", f1_score),
    )
    for caption, metric_fn in label_metrics:
        print(caption, metric_fn(y_true, y_pred))

    # AUC is needed for both the printout and the legend; compute it once.
    auc = roc_auc_score(y_true, y_probs)
    print("ROC AUC:", auc)
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))

    fpr, tpr, _ = roc_curve(y_true, y_probs)
    plt.plot(fpr, tpr, label=f'{name} (AUC = {auc:.2f})')
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("ROC Curve")
    plt.legend()

# Report both models and overlay their ROC curves on one figure.
evaluate_model(y_test, svm_preds, svm_probs, "SVM")

# The network outputs probabilities only; threshold at 0.5 for class labels.
nn_preds = (nn_probs > 0.5).astype(int)
evaluate_model(y_test, nn_preds, nn_probs, "Neural Net")

plt.show()

## Next Steps
- Fill in the poster template using the results
- Highlight key variables with the strongest correlation to unemployment
- Formulate 2–3 public policy recommendations based on findings