# Dataset link
https://www.kaggle.com/datasets/gokulrajkmv/unemployment-in-india

In [691]:
import pandas as pd

# Loading dataset

In [692]:
# NOTE: absolute, machine-specific location of the Kaggle CSV; adjust it for your environment.
csv_location = "D:\\MECS\\Internships\\Oasis Infobyte\\Datasets\\Unemployment in India.csv"
df = pd.read_csv(csv_location)

In [693]:
# Preview the loaded frame; the trailing rows shown in the output are entirely empty.
df

Unnamed: 0,Region,Date,Frequency,Estimated Unemployment Rate (%),Estimated Employed,Estimated Labour Participation Rate (%),Area
0,Andhra Pradesh,31-05-2019,Monthly,3.65,11999139.0,43.24,Rural
1,Andhra Pradesh,30-06-2019,Monthly,3.05,11755881.0,42.05,Rural
2,Andhra Pradesh,31-07-2019,Monthly,3.75,12086707.0,43.50,Rural
3,Andhra Pradesh,31-08-2019,Monthly,3.32,12285693.0,43.97,Rural
4,Andhra Pradesh,30-09-2019,Monthly,5.17,12256762.0,44.68,Rural
...,...,...,...,...,...,...,...
763,,,,,,,
764,,,,,,,
765,,,,,,,
766,,,,,,,


In [694]:
# Count the missing values in each column.
df.isnull().sum()

Region                                      28
 Date                                       28
 Frequency                                  28
 Estimated Unemployment Rate (%)            28
 Estimated Employed                         28
 Estimated Labour Participation Rate (%)    28
Area                                        28
dtype: int64

In [695]:
# Drop every row that contains a missing value (mutates df in place).
df.dropna(axis=0,inplace=True)

In [696]:
# Inspect the distinct values of the frequency column (its name carries a leading space).
df[' Frequency'].unique()

array([' Monthly', 'Monthly'], dtype=object)

In [697]:
# ' Frequency' only holds spelling variants of "Monthly", so it carries no information.
# errors='ignore' keeps this cell re-runnable: a second execution no longer raises
# KeyError because the column has already been removed.
df.drop(columns=[' Frequency'], inplace=True, errors='ignore')

In [698]:
# Check the column dtypes; ' Date' is still an object (string) column at this point.
df.dtypes

Region                                       object
 Date                                        object
 Estimated Unemployment Rate (%)            float64
 Estimated Employed                         float64
 Estimated Labour Participation Rate (%)    float64
Area                                         object
dtype: object

## Converting the datatype of Date attribute

In [699]:
# Parse the ' dd-mm-YYYY' strings (note the leading space in the format), then
# re-encode every date as the float YYYYMMDD.0 so the column becomes numeric.
parsed_dates = pd.to_datetime(df[' Date'], format=' %d-%m-%Y')
date_labels = parsed_dates.dt.strftime('%Y%m%d.0')
df[' Date'] = date_labels.astype(float)

In [700]:
# Numeric columns only. An explicit copy is taken because this frame is modified
# later on (outlier capping); writing into a selection of df can otherwise trigger
# pandas' SettingWithCopyWarning and leaves it unclear whether df itself is affected.
nums = df.select_dtypes(include='number').copy()

# Checking for outliers

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# One boxplot per numeric column to eyeball potential outliers.
for column_name in nums.columns:
    fig, ax = plt.subplots()
    sns.boxplot(x=nums[column_name], ax=ax)
    ax.set_title("Boxplot for :" + column_name)
    plt.show()

## Capping outliers (IQR rule)

In [None]:
# Cap outliers with the 1.5 * IQR rule: values outside
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are replaced by the nearest bound.
# No rows are removed, so the index of nums stays aligned with df.
IQR_MULTIPLIER = 1.5

for column_name in nums.columns:
    q1 = nums[column_name].quantile(0.25)  # 25% quantile of the column
    q3 = nums[column_name].quantile(0.75)  # 75% quantile of the column
    iqr = q3 - q1                          # interquartile range
    lower_bound = q1 - IQR_MULTIPLIER * iqr
    upper_bound = q3 + IQR_MULTIPLIER * iqr

    # Series.clip performs the same capping as a per-value if/elif/else loop,
    # but vectorized; values inside the bounds (and NaN) are left untouched.
    nums[column_name] = nums[column_name].clip(lower=lower_bound, upper=upper_bound)

# Encoding categorical variables

In [None]:
# Select the non-numeric columns; these are treated as the categorical features.
cat=df.select_dtypes(exclude='number')

In [None]:
from sklearn import preprocessing

# Map every category to an integer code, keeping the original column names and
# row index so the result can be concatenated with the other features later.
ordi = preprocessing.OrdinalEncoder()
encoded_values = ordi.fit_transform(cat)
new_cat = pd.DataFrame(encoded_values, index=cat.index, columns=cat.columns)
new_cat

In [None]:
import pickle

# Persist the fitted encoder. The context manager guarantees the file is flushed
# and closed; a bare open() inside the call leaves that to garbage collection.
with open('encoder.pkl', 'wb') as encoder_file:
    pickle.dump(ordi, encoder_file)

# Standardizing the numerical attributes

In [None]:
from sklearn.preprocessing import StandardScaler

# The target column must not be scaled together with the features.
# Re-binding nums (instead of an in-place drop) with errors='ignore' keeps this
# cell re-runnable: a second execution no longer raises KeyError.
nums = nums.drop(columns=[' Estimated Unemployment Rate (%)'], errors='ignore')

# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test
# split further down, so test-set statistics influence the scaling - confirm
# whether that is acceptable for this analysis.
scaler = StandardScaler()
std_num = pd.DataFrame(scaler.fit_transform(nums),
                       columns=nums.columns,
                       index=nums.index)

In [None]:
# Persist the fitted scaler; the context manager closes the file handle deterministically.
with open('scaling.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [None]:
# Reassemble the modelling table: encoded categoricals, scaled numerics and the
# untouched target column taken from df, joined side by side on the row index.
target_series = df[' Estimated Unemployment Rate (%)']
new_df = pd.concat([new_cat, std_num, target_series], axis='columns')

In [None]:
# Display the assembled modelling table.
new_df

In [None]:
# Pairwise correlations of all columns, drawn as an annotated heatmap.
corr = new_df.corr()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, annot=True, ax=ax)
ax.set_title("Features Correlation", fontsize=15)
plt.show()

# Splitting for training and testing

In [None]:
# Separate the target (y) from the feature matrix (x).
target_name = ' Estimated Unemployment Rate (%)'
y = new_df[target_name]
x = new_df.drop(columns=[target_name])

In [None]:
from sklearn.model_selection import train_test_split

# 70 % of the rows go to training; the fixed seed keeps the partition repeatable.
split_parts = train_test_split(x, y, train_size=0.7, random_state=100)
X_train, X_test, y_train, y_test = split_parts

# Linear, lasso and ridge regression

In [None]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge

# Three linear baselines, all with scikit-learn's default hyper-parameters.
regressor = LinearRegression()
lasso = Lasso()
ridge = Ridge()
linear_estimators = (regressor, lasso, ridge)

# Fit every estimator on the training split.
for estimator in linear_estimators:
    estimator.fit(X_train, y_train)

# Predict on the held-out test split, in the same order as above.
y_pred_lr, y_pred_lasso, y_pred_ridge = (
    estimator.predict(X_test) for estimator in linear_estimators
)

In [None]:
from sklearn import metrics
import numpy as np

# One shared row layout instead of three copy-pasted format strings; the printed
# output is unchanged.
ROW_TEMPLATE = "{} \t {:.2f} \t\t {:.2f} \t{:.2f} \t{:.2f} \t\t{:.2f}"

print("Model\t\t\t RMSE \t\t MSE \tMAE \t\t MAPE \t\t\t R2")
for model_label, predictions in (
    ("LinearRegression", y_pred_lr),
    ("LassoRegression", y_pred_lasso),
    ("RidgeRegression", y_pred_ridge),
):
    # MSE is computed once and reused for the RMSE column.
    mse = metrics.mean_squared_error(y_test, predictions)
    print(ROW_TEMPLATE.format(
        model_label,
        np.sqrt(mse),
        mse,
        metrics.mean_absolute_error(y_test, predictions),
        metrics.mean_absolute_percentage_error(y_test, predictions),
        metrics.r2_score(y_test, predictions),
    ))

# KNN regressor

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# k-nearest-neighbours regressor with default settings; fit() returns the
# estimator itself, so the last line still displays the fitted model.
regressor_knn = KNeighborsRegressor().fit(X_train, y_train)
regressor_knn

In [None]:
# Predict on the test split with the KNN model.
# NOTE(review): y_test_pred is reassigned in the later model sections, so any
# cell reading it depends on which prediction cell ran last.
y_test_pred = regressor_knn.predict(X_test)

In [None]:
# Side-by-side view of true and predicted targets; only the first 5 rows are shown.
comparison_columns = {'Actual': y_test, 'Predicted': y_test_pred}
temp_df = pd.DataFrame(comparison_columns)
temp_df.head()

In [None]:
# Test-set metrics for the predictions currently stored in y_test_pred.
mse_value = metrics.mean_squared_error(y_test, y_test_pred)
metric_lines = (
    ('Mean Absolute Error: ', metrics.mean_absolute_error(y_test, y_test_pred)),
    ('Mean Squared Error: ', mse_value),
    ('Root Mean Squared Error: ', np.sqrt(mse_value)),
    ('Mean Absolute percentage Error: ', metrics.mean_absolute_percentage_error(y_test, y_test_pred)),
    ("R-Square is", metrics.r2_score(y_test, y_test_pred)),
)
for caption, value in metric_lines:
    print(caption, value)

# Decision Tree Regressor

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Fix the seed so the fitted tree (and every metric derived from it) is
# reproducible across runs; 100 matches the seed used for the train/test split.
regressor_dt = DecisionTreeRegressor(random_state=100)
regressor_dt.fit(X_train, y_train)

In [None]:
# Predict on the test split with the decision tree.
# NOTE(review): this overwrites the y_test_pred produced by the previous model section.
y_test_pred = regressor_dt.predict(X_test)

In [None]:
# Side-by-side view of true and predicted targets; only the first 5 rows are shown.
comparison_columns = {'Actual': y_test, 'Predicted': y_test_pred}
temp_df = pd.DataFrame(comparison_columns)
temp_df.head()

In [None]:
# Test-set metrics for the predictions currently stored in y_test_pred.
mse_value = metrics.mean_squared_error(y_test, y_test_pred)
metric_lines = (
    ('Mean Absolute Error: ', metrics.mean_absolute_error(y_test, y_test_pred)),
    ('Mean Squared Error: ', mse_value),
    ('Root Mean Squared Error: ', np.sqrt(mse_value)),
    ('Mean Absolute percentage Error: ', metrics.mean_absolute_percentage_error(y_test, y_test_pred)),
    ("R-Square is", metrics.r2_score(y_test, y_test_pred)),
)
for caption, value in metric_lines:
    print(caption, value)

# Random Forest regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Fix the seed so bootstrapping and feature sampling are repeatable, which makes
# the reported scores reproducible; 100 matches the seed of the train/test split.
regressor_rf = RandomForestRegressor(random_state=100)
regressor_rf.fit(X_train, y_train)

In [None]:
# Predict on the test split with the random forest.
# NOTE(review): this overwrites the y_test_pred produced by the previous model section.
y_test_pred = regressor_rf.predict(X_test)

In [None]:
# Side-by-side view of true and predicted targets; only the first 5 rows are shown.
comparison_columns = {'Actual': y_test, 'Predicted': y_test_pred}
temp_df = pd.DataFrame(comparison_columns)
temp_df.head()

In [None]:
# Test-set metrics for the predictions currently stored in y_test_pred.
mse_value = metrics.mean_squared_error(y_test, y_test_pred)
metric_lines = (
    ('Mean Absolute Error: ', metrics.mean_absolute_error(y_test, y_test_pred)),
    ('Mean Squared Error: ', mse_value),
    ('Root Mean Squared Error: ', np.sqrt(mse_value)),
    ('Mean Absolute percentage Error: ', metrics.mean_absolute_percentage_error(y_test, y_test_pred)),
    ("R-Square is", metrics.r2_score(y_test, y_test_pred)),
)
for caption, value in metric_lines:
    print(caption, value)

# Plotting R² scores

In [None]:
# Compare all fitted models by their R2 score on the held-out test split.
models = [regressor, lasso, ridge, regressor_knn, regressor_dt, regressor_rf]
names = ["Linear Regression", "Lasso Regression", "Ridge Regression",
         "KNN Regressor", "Decision Tree regressor", "Random Forest Regressor"]

# R2 of every model, in the same order as `names`.
r2 = [metrics.r2_score(y_test, model.predict(X_test)) for model in models]

# The plotted quantity is R2, not classification accuracy, so the labels say so.
plt.plot(names, r2, 'b-o', label='R2 score over different models')
plt.xticks(rotation=90, ha='right')  # model names as rotated tick labels
plt.xlabel('Model')
plt.ylabel('R2 score')

plt.show()

# Among all algorithms, the Random Forest regressor achieved the highest R² score (about 0.58)

# Pickling the trained model (Random Forest)

In [None]:
# Persist the fitted random forest; the context manager closes the file handle deterministically.
with open('rf.pkl', 'wb') as model_file:
    pickle.dump(regressor_rf, model_file)

In [None]:
# Reload the persisted model; the context manager closes the file handle.
# NOTE: pickle.load can execute arbitrary code - only load files you created yourself.
with open('rf.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

In [None]:
# Take the first test row as a single-sample input for the reloaded model.
pred=X_test.head(1)

In [None]:
# Predict the target for that single row with the model loaded from rf.pkl.
model.predict(pred)

In [None]:
# Actual target value of the first test row, for comparison with the prediction.
y_test.head(1)

In [None]:
# Reload the persisted encoder; the context manager closes the file handle.
# NOTE: pickle.load can execute arbitrary code - only load files you created yourself.
with open('encoder.pkl', 'rb') as encoder_file:
    encoderr = pickle.load(encoder_file)

In [None]:
# The encoder was fitted on a DataFrame, so scikit-learn recorded its column
# names. Wrapping the sample in a DataFrame with those names avoids the
# "X does not have valid feature names" warning a bare list would trigger.
sample_categories = pd.DataFrame([['Andhra Pradesh', 'Rural']],
                                 columns=encoderr.feature_names_in_)
encoderr.transform(sample_categories)

In [None]:
# Reload the persisted scaler; the context manager closes the file handle.
# NOTE: pickle.load can execute arbitrary code - only load files you created yourself.
with open('scaling.pkl', 'rb') as scaler_file:
    scaler = pickle.load(scaler_file)

In [None]:
# The scaler was fitted on a DataFrame, so scikit-learn recorded its column
# names. Wrapping the sample (date as YYYYMMDD float, employed count, labour
# participation rate) in a DataFrame with those names avoids the
# "X does not have valid feature names" warning a bare list would trigger.
sample_numbers = pd.DataFrame([[20190531.0, 11999139.0, 43.24]],
                              columns=scaler.feature_names_in_)
scaler.transform(sample_numbers)