# In [None]:  (notebook cell marker left over from the Jupyter export)
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR  # Importing Support Vector Regression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

file_path = r"C:\Users\ABHAY\OneDrive\Desktop\Migration Prediction.csv"

# Load the raw export. The first two lines of the file are skipped, so the
# third line is consumed as the header row (and then overwritten below).
df = pd.read_csv(file_path, skiprows=2)

# Replace the header with short labels: the country name followed by one
# column per year, 2011-2021. This raises ValueError if the file does not
# contain exactly 12 columns.
df.columns = ['Country', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021']

# Drop the "% of foreign population" rows. Take an explicit copy so the
# assignments below modify an independent frame instead of a slice of the
# original (which triggers SettingWithCopyWarning).
df = df[~df['Country'].str.contains('% of foreign population', na=False)].copy()
df['Country'] = df['Country'].str.strip()  # Removing extra spaces

# Convert every year column to a numeric dtype; cells that cannot be parsed
# become NaN. Each column is replaced by label rather than written through
# df.iloc[:, 1:], because an iloc assignment sets values into the existing
# columns and can leave them with object dtype.
year_columns = df.columns[1:]
for year_column in year_columns:
    df[year_column] = pd.to_numeric(df[year_column], errors='coerce')

# Drop rows with missing values (including cells coerced to NaN above)
df.dropna(inplace=True)

print("Cleaned DataFrame:")
print(df.head())

# Saving cleaned data
df.to_csv("cleaned_data.csv", index=False)

# Features are every year column except the last; the 2021 column is the
# regression target. The country name is not used as a feature.
y = df["2021"]
X = df.drop(["Country", "2021"], axis=1)

# Hold out 20% of the rows for evaluation; the seed keeps the split fixed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build the three regressors first, then fit them in the same order as before.
rf_model = RandomForestRegressor(random_state=42)  # Random Forest Regressor
lr_model = LinearRegression()  # Linear Regression
# NOTE(review): SVR with an RBF kernel is sensitive to feature scale and the
# inputs here are unscaled -- consider adding a scaling step.
svr_model = SVR(kernel='rbf')  # Support Vector Regression, RBF kernel

for estimator in (rf_model, lr_model, svr_model):
    estimator.fit(X_train, y_train)

# Evaluate models on the held-out test split
rf_y_pred = rf_model.predict(X_test)
lr_y_pred = lr_model.predict(X_test)
svr_y_pred = svr_model.predict(X_test)

# RMSE is computed as the square root of the MSE. The `squared=False`
# keyword of mean_squared_error was deprecated in scikit-learn 1.4 and
# removed in 1.6, so taking the root explicitly works on every version.

# Metrics for Random Forest
rf_mae = mean_absolute_error(y_test, rf_y_pred)
rf_rmse = np.sqrt(mean_squared_error(y_test, rf_y_pred))
rf_r2 = r2_score(y_test, rf_y_pred)

# Metrics for Linear Regression
lr_mae = mean_absolute_error(y_test, lr_y_pred)
lr_rmse = np.sqrt(mean_squared_error(y_test, lr_y_pred))
lr_r2 = r2_score(y_test, lr_y_pred)

# Metrics for SVR
svr_mae = mean_absolute_error(y_test, svr_y_pred)
svr_rmse = np.sqrt(mean_squared_error(y_test, svr_y_pred))
svr_r2 = r2_score(y_test, svr_y_pred)

print(f"Random Forest - MAE: {rf_mae:.2f}, RMSE: {rf_rmse:.2f}, R²: {rf_r2:.2f}")
print(f"Linear Regression - MAE: {lr_mae:.2f}, RMSE: {lr_rmse:.2f}, R²: {lr_r2:.2f}")
print(f"Support Vector Regression - MAE: {svr_mae:.2f}, RMSE: {svr_rmse:.2f}, R²: {svr_r2:.2f}")

# Persist each fitted model to its own pickle file in the working directory.
for model_file, fitted_model in (
    ('rf_migration_model.pkl', rf_model),
    ('lr_migration_model.pkl', lr_model),
    ('svr_migration_model.pkl', svr_model),
):
    joblib.dump(fitted_model, model_file)

# One hypothetical observation: a value for each feature year 2011-2020,
# starting at 5000 and rising by 1000 per year.
sample_years = ['2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020']
sample_values = [5000, 6000, 7000, 8000, 9000, 10000, 11000, 12000, 13000, 14000]
new_data = pd.DataFrame([sample_values], columns=sample_years)

# Ask each trained model for its 2021 estimate of that observation.
rf_new_prediction = rf_model.predict(new_data)
lr_new_prediction = lr_model.predict(new_data)
svr_new_prediction = svr_model.predict(new_data)

# Combine the three estimates into one number with an unweighted mean.
final_prediction = np.mean(
    [
        model_output[0]
        for model_output in (rf_new_prediction, lr_new_prediction, svr_new_prediction)
    ]
)

print(f"Final Predicted migration for 2021 (Averaged Prediction): {final_prediction:.2f}")

# Predict migration for a specific country by fitting a straight line
# through that country's yearly values and evaluating it at a chosen year.
country_name = input("Enter the country name: ").strip().lower()

# Case-insensitive lookup; the lower-cased column is computed only once.
country_rows = df[df['Country'].str.lower() == country_name]

if not country_rows.empty:
    # Use the first matching row: every column after 'Country' is a yearly
    # value (2011-2021), paired below with its year as the single feature.
    country_data = country_rows.iloc[0, 1:].values.astype(float)
    years = np.arange(2011, 2022).reshape(-1, 1)  # years from 2011 to 2021

    # Show the name as spelled in the dataset; str.capitalize() would
    # lower-case every word after the first (e.g. "United states").
    display_name = country_rows['Country'].iloc[0]

    # Train the Linear Regression model on (year -> value)
    regressor = LinearRegression()
    regressor.fit(years, country_data)

    future_year = input("Enter the year: ").strip()

    # Keep the try body limited to parsing so unrelated errors are not hidden.
    try:
        future_year = float(future_year)
        if not np.isfinite(future_year):
            # "nan" / "inf" parse as floats but are not usable years.
            raise ValueError("year must be a finite number")
    except ValueError:
        # Handle non-numerical input for the year
        print("Please enter a valid numerical year.")
    else:
        future_prediction = regressor.predict([[future_year]])

        # Report the year that was actually used: whole years print without
        # a decimal point, fractional years are shown as entered.
        year_label = int(future_year) if future_year.is_integer() else future_year
        print(f"Predicted migration for {display_name} in {year_label}: {future_prediction[0]:.2f}")
else:
    # Country not found in the dataset
    print("Country not found in the dataset.")