In [None]:
import pandas as pd

# Load the CSV files
file1 = pd.read_csv("/content/Unemployment in India.csv")
file2 = pd.read_csv("/content/Unemployment_Rate_upto_11_2020.csv")

# Clean column names
file1.columns = file1.columns.str.strip()
file2.columns = file2.columns.str.strip()

# Convert 'Date' columns to datetime
file1['Date'] = pd.to_datetime(file1['Date'], dayfirst=True, errors='coerce')
file2['Date'] = pd.to_datetime(file2['Date'], dayfirst=True, errors='coerce')

# Merge on 'Region' and 'Date'
merged_df = pd.merge(file1, file2, on=['Region', 'Date'], how='outer', suffixes=('_file1', '_file2'))

# Rename columns (optional but helpful)
merged_df = merged_df.rename(columns={
    "Estimated Unemployment Rate (%)_file1": "Unemployment Rate (File1)",
    "Estimated Unemployment Rate (%)_file2": "Unemployment Rate (File2)",
    "Estimated Employed_file1": "Employed (File1)",
    "Estimated Employed_file2": "Employed (File2)",
    "Estimated Labour Participation Rate (%)_file1": "Labour Participation Rate (File1)",
    "Estimated Labour Participation Rate (%)_file2": "Labour Participation Rate (File2)"
})

# Save to a new CSV file
merged_df.to_csv("merged_unemployment_data.csv", index=False)

print("✅ Merged file created: merged_unemployment_data.csv")


✅ Merged file created: merged_unemployment_data.csv


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

# Load data
df = pd.read_csv("/content/merged_unemployment_data.csv")
df = df[['Region', 'Date', 'Unemployment Rate (File1)', 'Employed (File1)', 'Labour Participation Rate (File1)']]
df.dropna(inplace=True)

# Feature Engineering
df['Date'] = pd.to_datetime(df['Date'])
df['Month'] = df['Date'].dt.month
df['Year'] = df['Date'].dt.year
df['Region'] = LabelEncoder().fit_transform(df['Region'])

# Features and Target
X = df[['Region', 'Month', 'Year', 'Employed (File1)', 'Labour Participation Rate (File1)']]
y = df['Unemployment Rate (File1)']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Models
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest": RandomForestRegressor(),
    "Gradient Boosting": GradientBoostingRegressor(),
    "Support Vector Regressor": SVR(),
    "KNN": KNeighborsRegressor()
}

# Evaluation
for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    print(f"\n{name}")
    print("MAE:", mean_absolute_error(y_test, preds))
    print("MSE:", mean_squared_error(y_test, preds))
    print("RMSE:", np.sqrt(mean_squared_error(y_test, preds)))
    print("R² Score:", r2_score(y_test, preds))



Linear Regression
MAE: 7.833878615256611
MSE: 129.27336592173006
RMSE: 11.369844586524922
R² Score: 0.05394995908935751

Decision Tree
MAE: 4.775472972972973
MSE: 88.87384662162162
RMSE: 9.427292645379246
R² Score: 0.349602250759233

Random Forest
MAE: 4.158274324324324
MSE: 59.71671387594592
RMSE: 7.727659016542197
R² Score: 0.5629803617893475

Gradient Boosting
MAE: 4.562774190936859
MSE: 59.9290251945271
RMSE: 7.741383932768552
R² Score: 0.5614266223148827

Support Vector Regressor
MAE: 7.7689068457759705
MSE: 145.83780599569894
RMSE: 12.076332472886747
R² Score: -0.06727214337471987

KNN
MAE: 7.747932432432433
MSE: 134.64101927027028
RMSE: 11.603491684414234
R² Score: 0.014668327998733854
