# **Mental Health Dataset of Nepal**

## Importing All Necessary Libraries

In [None]:
import pandas as pd # used for analyzing, cleaning, exploring, and manipulating the data
import numpy as np # used for numerical computing
import plotly.express as px # for data visualization
from sklearn.preprocessing import LabelEncoder # for encoding categorical values to numerical values
from sklearn.model_selection import train_test_split # for splitting the dataset into training and testing sets
from sklearn.ensemble import RandomForestClassifier # used for solving classification problems
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix # for measuring the performance and effectiveness of algorithms
from sklearn.neighbors import KNeighborsClassifier # used for the K-Nearest Neighbors algorithm
from sklearn.linear_model import LogisticRegression # used for logistic regression classification

## Data Exploration

In [None]:
# Load the CSV file into df (DataFrame) for further operations.
# NOTE(review): the path looks like a Google Drive mount inside Colab and no mount call is
# visible in this notebook - presumably Drive must be mounted before this cell runs; confirm.
df = pd.read_csv('/content/drive/MyDrive/Data Analytics/Mental_health_dataset.csv')

In [None]:
# Display the loaded dataframe
df

Unnamed: 0,S.N,District_Name,Zone,Ecological_Belt,Development_Region,Year_BS,Year_AD,condition,type,Male,Female,Age,Married,Unmarried,Education,Employment,lat,long
0,0,Taplejung,Mechi,Mountain,Eastern,2069,2012,Severe,Dipression,26,24,19,27,23,"Some College, short continuing education or eq...",no,27.618589,87.856661
1,1,Taplejung,Mechi,Mountain,Eastern,2069,2012,Severe,Psychosis,53,30,50,57,26,,yes,27.618589,87.856661
2,2,Taplejung,Mechi,Mountain,Eastern,2069,2012,Major,Anxiety (Neurosis),24,32,21,37,19,#####,yes,27.618589,87.856661
3,3,Taplejung,Mechi,Mountain,Eastern,2069,2012,Major,Mental retardation,48,46,20,51,43,"College degree, bachelor, master",yes,27.618589,87.856661
4,4,Taplejung,Mechi,Mountain,Eastern,2069,2012,Major,Conversive disorder (Hysteria),49,29,60,45,33,#####,retired,27.618589,87.856661
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2620,2620,Darchula,Mahakali,Mountain,Far-Western,2077,2020,Minor,Anxiety (Neurosis),47,41,53,47,41,"College degree, bachelor, master",yes,29.892711,80.741361
2621,2621,Darchula,Mahakali,Mountain,Far-Western,2077,2020,Severe,Mental retardation,18,22,38,24,16,"College degree, bachelor, master",no,29.892711,80.741361
2622,2622,Darchula,Mahakali,Mountain,Far-Western,2077,2020,Severe,Conversive disorder (Hysteria),27,30,18,5,52,Up to 12 years of school,no,29.892711,80.741361
2623,2623,Darchula,Mahakali,Mountain,Far-Western,2077,2020,Normal,Alcoholism,55,36,60,33,58,"College degree, bachelor, master",#####,29.892711,80.741361


In [None]:
df.info() # summary of the dataframe: column labels, non-null counts and data types

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2625 entries, 0 to 2624
Data columns (total 18 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   S.N                 2625 non-null   int64  
 1   District_Name       2625 non-null   object 
 2   Zone                2625 non-null   object 
 3   Ecological_Belt     2625 non-null   object 
 4   Development_Region  2625 non-null   object 
 5   Year_BS             2625 non-null   int64  
 6   Year_AD             2625 non-null   int64  
 7   condition           2625 non-null   object 
 8   type                2625 non-null   object 
 9   Male                2625 non-null   int64  
 10  Female              2625 non-null   int64  
 11  Age                 2625 non-null   int64  
 12  Married             2625 non-null   int64  
 13  Unmarried           2625 non-null   int64  
 14  Education           2625 non-null   object 
 15  Employment          2625 non-null   object 
 16  lat   

In [None]:
df.head() # show the first rows of the dataframe (no count is passed, so the pandas default applies)

Unnamed: 0,S.N,District_Name,Zone,Ecological_Belt,Development_Region,Year_BS,Year_AD,condition,type,Male,Female,Age,Married,Unmarried,Education,Employment,lat,long
0,0,Taplejung,Mechi,Mountain,Eastern,2069,2012,Severe,Dipression,26,24,19,27,23,"Some College, short continuing education or eq...",no,27.618589,87.856661
1,1,Taplejung,Mechi,Mountain,Eastern,2069,2012,Severe,Psychosis,53,30,50,57,26,,yes,27.618589,87.856661
2,2,Taplejung,Mechi,Mountain,Eastern,2069,2012,Major,Anxiety (Neurosis),24,32,21,37,19,#####,yes,27.618589,87.856661
3,3,Taplejung,Mechi,Mountain,Eastern,2069,2012,Major,Mental retardation,48,46,20,51,43,"College degree, bachelor, master",yes,27.618589,87.856661
4,4,Taplejung,Mechi,Mountain,Eastern,2069,2012,Major,Conversive disorder (Hysteria),49,29,60,45,33,#####,retired,27.618589,87.856661


In [None]:
df.tail() # show the last rows of the dataframe (no count is passed, so the pandas default applies)

Unnamed: 0,S.N,District_Name,Zone,Ecological_Belt,Development_Region,Year_BS,Year_AD,condition,type,Male,Female,Age,Married,Unmarried,Education,Employment,lat,long
2620,2620,Darchula,Mahakali,Mountain,Far-Western,2077,2020,Minor,Anxiety (Neurosis),47,41,53,47,41,"College degree, bachelor, master",yes,29.892711,80.741361
2621,2621,Darchula,Mahakali,Mountain,Far-Western,2077,2020,Severe,Mental retardation,18,22,38,24,16,"College degree, bachelor, master",no,29.892711,80.741361
2622,2622,Darchula,Mahakali,Mountain,Far-Western,2077,2020,Severe,Conversive disorder (Hysteria),27,30,18,5,52,Up to 12 years of school,no,29.892711,80.741361
2623,2623,Darchula,Mahakali,Mountain,Far-Western,2077,2020,Normal,Alcoholism,55,36,60,33,58,"College degree, bachelor, master",#####,29.892711,80.741361
2624,2624,Darchula,Mahakali,Mountain,Far-Western,2077,2020,Normal,Epilesy,39,30,24,48,21,"College degree, bachelor, master",yes,29.892711,80.741361


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

Unnamed: 0,S.N,Year_BS,Year_AD,Male,Female,Age,Married,Unmarried,lat,long
count,2625.0,2625.0,2625.0,2625.0,2625.0,2625.0,2625.0,2625.0,2625.0,2625.0
mean,1312.0,2071.862857,2014.862857,39.217143,38.765333,38.762286,37.357333,40.625143,28.003789,84.254905
std,757.916552,2.568618,2.568618,12.639741,12.423508,12.513433,13.006108,21.568092,0.885817,2.170864
min,0.0,2069.0,2012.0,18.0,18.0,18.0,1.0,-6.0,26.571975,80.38209
25%,656.0,2070.0,2013.0,28.0,28.0,28.0,27.0,25.0,27.21832,82.359404
50%,1312.0,2071.0,2014.0,39.0,38.0,39.0,38.0,40.0,27.948879,84.235742
75%,1968.0,2074.0,2017.0,51.0,50.0,50.0,48.0,55.0,28.692376,86.00722
max,2624.0,2077.0,2020.0,60.0,60.0,60.0,60.0,108.0,30.035133,87.955321


In [None]:
# Data type of every column
df.dtypes

S.N                     int64
District_Name          object
Zone                   object
Ecological_Belt        object
Development_Region     object
Year_BS                 int64
Year_AD                 int64
condition              object
type                   object
Male                    int64
Female                  int64
Age                     int64
Married                 int64
Unmarried               int64
Education              object
Employment             object
lat                   float64
long                  float64
dtype: object

## Data Preprocessing: Concepts and Knowledge

In [None]:
# Count the missing entries of every column of the dataframe
null_values = df.isna().sum()

print("No of missing values in each columns:", null_values, sep="\n")

No of missing values in each columns:
S.N                   0
District_Name         0
Zone                  0
Ecological_Belt       0
Development_Region    0
Year_BS               0
Year_AD               0
condition             0
type                  0
Male                  0
Female                0
Age                   0
Married               0
Unmarried             0
Education             0
Employment            0
lat                   0
long                  0
dtype: int64


In [None]:
# Capitalise the lower-case column labels so that every column follows the same naming pattern
new_column_names = dict(condition='Condition', type='Type', lat='Lat', long='Long')
df.rename(columns=new_column_names, inplace=True)

print("Updated column names:", df.columns, sep="\n")

Updated column names:
Index(['S.N', 'District_Name', 'Zone', 'Ecological_Belt', 'Development_Region',
       'Year_BS', 'Year_AD', 'Condition', 'Type', 'Male', 'Female', 'Age',
       'Married', 'Unmarried', 'Education', 'Employment', 'Lat', 'Long'],
      dtype='object')


In [None]:
# Boolean mask that is True for every row repeating an earlier row in all columns
duplicate_rows = df.duplicated()

# Print the rows marked as duplicates, if there are any
print("Duplicate rows:", df.loc[duplicate_rows], sep="\n")

Duplicate rows:
Empty DataFrame
Columns: [S.N, District_Name, Zone, Ecological_Belt, Development_Region, Year_BS, Year_AD, Condition, Type, Male, Female, Age, Married, Unmarried, Education, Employment, Lat, Long]
Index: []


In [None]:
# Count the distinct districts covered by the dataset
noOfDistrict = df['District_Name'].nunique()
print("Total No of District Studied:", noOfDistrict)

Total No of District Studied: 75


In [None]:
# Number of distinct values recorded in the Type column
noofType = df['Type'].nunique()
print("No of type in Type column:", noofType, sep="\n")

No of type in Type column:
7


In [None]:
# Report the distinct values found in each column of the dataframe
for column, column_values in df.items():
    unique_values = column_values.unique()
    print(f"Column '{column}' has {len(unique_values)} unique values:")
    print(unique_values)
    print("\n")

Column 'S.N' has 2625 unique values:
[   0    1    2 ... 2622 2623 2624]


Column 'District_Name' has 75 unique values:
['Taplejung' 'Panchthar' 'Ilam' 'Jhapa' 'Morang' 'Sunsari' 'Dhankuta'
 'Teharthum' 'Sankhuwasabha' 'Bhojpur' 'Solukhumbu' 'Okhaldhunga'
 'Khotang' 'Udaypur' 'Saptari' 'Siraha' 'Dhanusha' 'Mahottari' 'Sarlahi'
 'Sindhuli' 'Ramechhap' 'Dolkha' 'Sindhupalchowk' 'Kavre' 'Lalitpur'
 'Bhaktapur' 'Kathmandu' 'Nuwakot' 'Rasuwa' 'Dhading' 'Makawanpur'
 'Rautahat' 'Bara' 'Parsa' 'Chitwan' 'Gorkha' 'Lamjung' 'Tanahu' 'Syangja'
 'Kaski' 'Manang' 'Mustang' 'Myagdi' 'Parbat' 'Baglung' 'Gulmi' 'Palpa'
 'Nawalparasi' 'Rupandehi' 'Kapilvastu' 'Arghakhanchi' 'Pyuthan' 'Rolpa'
 'Rukum' 'Salyan' 'Dang' 'Banke' 'Bardiya' 'Surkhet' 'Dailekh' 'Jajarkot'
 'Dolpa' 'Jumla' 'Kalikot' 'Mugu' 'Humla' 'Bajura' 'Bajhang' 'Achham'
 'Doti' 'Kailali' 'Kanchanpur' 'Dadeldhura' 'Baitadi' 'Darchula']


Column 'Zone' has 14 unique values:
['Mechi' 'Kosi' 'Sagarmatha' 'Janakpur' 'Bagmati' 'Narayani' 'Ganda

In [None]:
# A head count cannot be negative: blank out the negative Unmarried entries with NaN
is_negative_unmarried = df['Unmarried'] < 0
df['Unmarried'] = np.where(is_negative_unmarried, np.nan, df['Unmarried'])
print(df['Unmarried'].unique())

# Number of entries that were blanked out
nan_count_Unmarried = df['Unmarried'].isna().sum()
print('Total number of nan in Unmarried column: ', nan_count_Unmarried, sep="\n")

[ 23.  26.  19.  43.  33.   1.  55.  80.  69.  32.  16.  56.  79.   8.
  11.  76.  61.  41.  59.  35.  34.  48.  38.  18.  10.  31.  22.  12.
  30.   0.  71.  66.  nan  97.  50.  14.   7.  39.   2.  77.  47.  13.
  49.   9.  37.  42.  28.  21.   6.  62.  63.   3.  45.  17.  60.  84.
  65.  57.  58.  73.  44.  20.  72.  82.  15. 108.  51.  75.  36.  89.
  40.  78.  24.  70.  29.  52.  54.  83.  46.  53.  25.  87.  27.  68.
  64.  85.  81.   5.  91. 105. 102.  94.  92.  86.  74.  67.  90.  99.
   4. 107.]
Total number of nan in Unmarried column: 
7


In [None]:
# Calculating the average of the Unmarried column
# (Series.mean skips NaN by default, so the entries blanked out above do not affect it)
averageofUnmarried = df['Unmarried'].mean()

print(f"The average of Unmarried column is: {averageofUnmarried}")

The average of Unmarried column is: 40.74293353705119


In [None]:
# Replace the NaN entries of the Unmarried column with the column average computed above.
# The result is assigned back to the column instead of calling fillna(..., inplace=True) on
# df['Unmarried']: that chained in-place call operates on an intermediate Series, triggers a
# FutureWarning in pandas 2.x and leaves df unchanged once Copy-on-Write is enabled.
df['Unmarried'] = df['Unmarried'].fillna(averageofUnmarried)

# Verify that no NaN is left in the column
nan_count_Unmarried = df['Unmarried'].isna().sum()
print('Total number of nan after replacing nan value with average of Unmarried:')
print(nan_count_Unmarried)

Total number of nan after replacing nan value with average of Unmarried:
0


In [None]:
# Correct the misspelled disorder names found in the Type column
df['Type'] = (
    df['Type']
    .str.replace('Dipression', 'Depression')
    .str.replace('Epilesy', 'Epilepsy')
)

In [None]:
# Some Type values carry two names ("X (Y)"); keep a single name for each of them
single_type_names = {
    'Anxiety (Neurosis)': 'Anxiety',
    'Conversive disorder (Hysteria)': 'Hysteria',
}
df['Type'] = df['Type'].replace(single_type_names)

In [None]:
# Turn the placeholder strings of Education and Employment into NaN and count the result
education_placeholders = ['####', '#####', '######', '#######', 'na/na', 'Na/na']
employment_placeholders = ['####', '#####', '######', '#######']

# Education: placeholders -> NaN, then report how many entries are missing
df["Education"] = df["Education"].replace(education_placeholders, np.nan)
nan_count_Education = df['Education'].isna().sum()
print(f"Total NaN count of Education column: {nan_count_Education}")

# Employment: same treatment
df["Employment"] = df["Employment"].replace(employment_placeholders, np.nan)
nan_count_Employment = df['Employment'].isna().sum()
print(f"Total NaN count of Employment column: {nan_count_Employment}")
df

Total NaN count of Education column: 177
Total NaN count of Employment column: 360


Unnamed: 0,S.N,District_Name,Zone,Ecological_Belt,Development_Region,Year_BS,Year_AD,Condition,Type,Male,Female,Age,Married,Unmarried,Education,Employment,Lat,Long
0,0,Taplejung,Mechi,Mountain,Eastern,2069,2012,Severe,Depression,26,24,19,27,23.0,"Some College, short continuing education or eq...",no,27.618589,87.856661
1,1,Taplejung,Mechi,Mountain,Eastern,2069,2012,Severe,Psychosis,53,30,50,57,26.0,,yes,27.618589,87.856661
2,2,Taplejung,Mechi,Mountain,Eastern,2069,2012,Major,Anxiety,24,32,21,37,19.0,,yes,27.618589,87.856661
3,3,Taplejung,Mechi,Mountain,Eastern,2069,2012,Major,Mental retardation,48,46,20,51,43.0,"College degree, bachelor, master",yes,27.618589,87.856661
4,4,Taplejung,Mechi,Mountain,Eastern,2069,2012,Major,Hysteria,49,29,60,45,33.0,,retired,27.618589,87.856661
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2620,2620,Darchula,Mahakali,Mountain,Far-Western,2077,2020,Minor,Anxiety,47,41,53,47,41.0,"College degree, bachelor, master",yes,29.892711,80.741361
2621,2621,Darchula,Mahakali,Mountain,Far-Western,2077,2020,Severe,Mental retardation,18,22,38,24,16.0,"College degree, bachelor, master",no,29.892711,80.741361
2622,2622,Darchula,Mahakali,Mountain,Far-Western,2077,2020,Severe,Hysteria,27,30,18,5,52.0,Up to 12 years of school,no,29.892711,80.741361
2623,2623,Darchula,Mahakali,Mountain,Far-Western,2077,2020,Normal,Alcoholism,55,36,60,33,58.0,"College degree, bachelor, master",,29.892711,80.741361


In [None]:
df['Education'] = df['Education'].replace('none', 'None') # replacing every 'none' with 'None' for consistency

In [None]:
# Fill the missing entries of the categorical columns with the label 'Unknown'.
# Only Education and Employment are targeted (the columns whose placeholders were turned
# into NaN earlier): a frame-wide fillna('Unknown') would also write the string into any
# numeric column that happened to contain NaN, silently turning it into an object column.
categorical_columns = ['Education', 'Employment']
df[categorical_columns] = df[categorical_columns].fillna('Unknown')

In [None]:
# Preview the first rows after the missing values were filled
df.head()

Unnamed: 0,S.N,District_Name,Zone,Ecological_Belt,Development_Region,Year_BS,Year_AD,Condition,Type,Male,Female,Age,Married,Unmarried,Education,Employment,Lat,Long
0,0,Taplejung,Mechi,Mountain,Eastern,2069,2012,Severe,Depression,26,24,19,27,23.0,"Some College, short continuing education or eq...",no,27.618589,87.856661
1,1,Taplejung,Mechi,Mountain,Eastern,2069,2012,Severe,Psychosis,53,30,50,57,26.0,,yes,27.618589,87.856661
2,2,Taplejung,Mechi,Mountain,Eastern,2069,2012,Major,Anxiety,24,32,21,37,19.0,Unknown,yes,27.618589,87.856661
3,3,Taplejung,Mechi,Mountain,Eastern,2069,2012,Major,Mental retardation,48,46,20,51,43.0,"College degree, bachelor, master",yes,27.618589,87.856661
4,4,Taplejung,Mechi,Mountain,Eastern,2069,2012,Major,Hysteria,49,29,60,45,33.0,Unknown,retired,27.618589,87.856661


In [None]:
# Bucket the ages into three groups. With right=False the intervals are closed on the left:
# [0, 20) -> young, [20, 40) -> Middle-age, [40, inf) -> old
bins = [0, 20, 40, float('inf')]
labels = ['young', 'Middle-age', 'old']
df['Age_Group'] = pd.cut(
    df['Age'],
    bins=bins,
    right=False,
    labels=labels,
)
df.head()

Unnamed: 0,S.N,District_Name,Zone,Ecological_Belt,Development_Region,Year_BS,Year_AD,Condition,Type,Male,Female,Age,Married,Unmarried,Education,Employment,Lat,Long,Age_Group
0,0,Taplejung,Mechi,Mountain,Eastern,2069,2012,Severe,Depression,26,24,19,27,23.0,"Some College, short continuing education or eq...",no,27.618589,87.856661,young
1,1,Taplejung,Mechi,Mountain,Eastern,2069,2012,Severe,Psychosis,53,30,50,57,26.0,,yes,27.618589,87.856661,old
2,2,Taplejung,Mechi,Mountain,Eastern,2069,2012,Major,Anxiety,24,32,21,37,19.0,Unknown,yes,27.618589,87.856661,Middle-age
3,3,Taplejung,Mechi,Mountain,Eastern,2069,2012,Major,Mental retardation,48,46,20,51,43.0,"College degree, bachelor, master",yes,27.618589,87.856661,Middle-age
4,4,Taplejung,Mechi,Mountain,Eastern,2069,2012,Major,Hysteria,49,29,60,45,33.0,Unknown,retired,27.618589,87.856661,old


In [None]:
# Summary of the dataframe after cleaning (columns, non-null counts, dtypes).
# info must be called: the bare attribute `df.info` only displays the bound-method repr.
df.info()

<bound method DataFrame.info of        S.N District_Name      Zone Ecological_Belt Development_Region  \
0        0     Taplejung     Mechi        Mountain            Eastern   
1        1     Taplejung     Mechi        Mountain            Eastern   
2        2     Taplejung     Mechi        Mountain            Eastern   
3        3     Taplejung     Mechi        Mountain            Eastern   
4        4     Taplejung     Mechi        Mountain            Eastern   
...    ...           ...       ...             ...                ...   
2620  2620      Darchula  Mahakali        Mountain        Far-Western   
2621  2621      Darchula  Mahakali        Mountain        Far-Western   
2622  2622      Darchula  Mahakali        Mountain        Far-Western   
2623  2623      Darchula  Mahakali        Mountain        Far-Western   
2624  2624      Darchula  Mahakali        Mountain        Far-Western   

      Year_BS  Year_AD Condition                Type  Male  Female  Age  \
0        2069   

In [None]:
# Filter data for severe conditions only
severe_df = df[df["Condition"] == "Severe"]

# Calculate the total number of severe records in each district
severe_counts = severe_df["District_Name"].value_counts().reset_index()
severe_counts.columns = ["District_Name", "Severe_Count"]

# Merge the counts into severe_df so that every record carries its district's total
# next to the Lat/Long coordinates
severe_df = pd.merge(severe_df, severe_counts, on="District_Name")

# Keep a single row per district for plotting. The merged frame has one row per severe
# record, so plotting it directly stacks many identical markers on the same coordinates.
# NOTE(review): assumes Lat/Long are constant within a district (true for the rows
# displayed in this notebook) - confirm against the full dataset.
severe_by_district = severe_df.drop_duplicates(subset="District_Name")

# Create the map using Plotly Express: marker colour and size both encode Severe_Count
fig = px.scatter_mapbox(severe_by_district, lat="Lat", lon="Long", hover_name="District_Name", hover_data=["Severe_Count"],
                        color="Severe_Count", size="Severe_Count", color_continuous_scale=px.colors.sequential.Reds,
                        size_max=40, zoom=5, mapbox_style="open-street-map",
                        title="Districts with Severe Conditions and Total Severities")

# Show the plot
fig.show()

In [None]:
# Pie chart of the share of records per condition, built with Plotly Express
fig = px.pie(
    df,
    names='Condition',
    title='Distribution of Conditions',
)

fig.show()

In [None]:
# Visualizing the distribution of each condition
fig = px.histogram(
    df,
    x='Condition',
    nbins=100,
    title='Condition Distribution',
)
fig.show()

In [None]:
# Box Plot
# One box of ages per employment status
fig = px.box(
    df,
    x='Employment',
    y='Age',
    title='Age distribution by Employment',
)
fig.show()

In [None]:
# Box Plot
# Age distribution for every mental-health type, split by condition (one colour per condition).
# The title previously said "Education" although the x-axis is the Type column.
fig = px.box(df, x='Type', y='Age', color='Condition', title='Age distribution by Type and Condition')
fig.show()

In [None]:
# Box Plot
# Age distribution for every education level, split by condition (one colour per condition).
# The title mentions Condition as well because the boxes are coloured by it.
fig = px.box(df, x='Education', y='Age', color='Condition', title='Age distribution by Education and Condition')
fig.show()

In [None]:
# Box Plot
# One box of ages per mental-health type
fig = px.box(
    df,
    x='Type',
    y='Age',
    title='Age Distribution by Mental Health Type',
)
fig.show()

In [None]:
# Histogram
# Distribution of the Age column
fig = px.histogram(
    df,
    x='Age',
    nbins=90,
    title='Age Distribution',
)
fig.show()

In [None]:
# Scatter Plot
# Age against education level, one colour per mental-health type
fig = px.scatter(
    df,
    x='Education',
    y='Age',
    color='Type',
    title='Scatter Plot of Age and Education',
)
fig.show()

In [None]:
# Export the cleaned dataset to CSV without writing the index column.
# At this point the frame holds the cleaned columns plus Age_Group; the encoded_* columns
# are created further down and are therefore not part of the export.
df.to_csv('/content/drive/MyDrive/Data Analytics/Mental_Health_Dataset_After_Cleaned.csv', index=False)

In [None]:
# Frequency of every district, i.e. the number of records per district
district_distribution = df['District_Name'].value_counts()
print(
    "Frequency distribution of mental health phenomena by district:\n",
    district_distribution,
    sep="\n",
)

Frequency distribution of mental health phenomena by district:

Taplejung         35
Banke             35
Salyan            35
Rukum             35
Rolpa             35
                  ..
Lalitpur          35
Kavre             35
Sindhupalchowk    35
Dolkha            35
Darchula          35
Name: District_Name, Length: 75, dtype: int64


In [None]:
# Highest record count among the districts and the district it belongs to.
# idxmax returns the first label holding the maximum; every count displayed above is 35,
# so the district reported here is simply the first of the tied districts.
max_count = district_distribution.max()
max_district = district_distribution.idxmax()

print("\nThe district with the highest number of mental health phenomena is:", max_district)
print("Number of mental health phenomena in this district:", max_count)


The district with the highest number of mental health phenomena is: Taplejung
Number of mental health phenomena in this district: 35


In [None]:
# Encode each categorical column as integer codes, keeping one fitted encoder per column.
# Re-fitting a single LabelEncoder on every column discards the earlier mappings, so only
# the last column could be decoded afterwards via classes_ / inverse_transform.
columns_to_encode = {
    'Condition': 'encoded_Conditions',
    'Type': 'encoded_Types',
    'Education': 'encoded_Education',
    'Employment': 'encoded_Employment',
}
label_encoders = {}  # source column name -> its fitted LabelEncoder
for source_column, encoded_column in columns_to_encode.items():
    label_encoder = LabelEncoder()
    # Fit on the column and store the integer codes in a new encoded_* column
    df[encoded_column] = label_encoder.fit_transform(df[source_column])
    label_encoders[source_column] = label_encoder
# After the loop `label_encoder` is the encoder fitted on Employment, the same state the
# name had when one encoder was re-fitted on each column in turn.
df.head()

Unnamed: 0,S.N,District_Name,Zone,Ecological_Belt,Development_Region,Year_BS,Year_AD,Condition,Type,Male,...,Unmarried,Education,Employment,Lat,Long,Age_Group,encoded_Conditions,encoded_Types,encoded_Education,encoded_Employment
0,0,Taplejung,Mechi,Mountain,Eastern,2069,2012,Severe,Depression,26,...,23.0,"Some College, short continuing education or eq...",no,27.618589,87.856661,young,3,2,3,1
1,1,Taplejung,Mechi,Mountain,Eastern,2069,2012,Severe,Psychosis,53,...,26.0,,yes,27.618589,87.856661,old,3,6,1,3
2,2,Taplejung,Mechi,Mountain,Eastern,2069,2012,Major,Anxiety,24,...,19.0,Unknown,yes,27.618589,87.856661,Middle-age,0,1,4,3
3,3,Taplejung,Mechi,Mountain,Eastern,2069,2012,Major,Mental retardation,48,...,43.0,"College degree, bachelor, master",yes,27.618589,87.856661,Middle-age,0,5,0,3
4,4,Taplejung,Mechi,Mountain,Eastern,2069,2012,Major,Hysteria,49,...,33.0,Unknown,retired,27.618589,87.856661,old,0,4,4,2


## Correlation Matrix

In [None]:
# Correlation matrix over the numeric columns only.
# The text and categorical columns (District_Name, Condition, Age_Group, ...) are dropped
# explicitly: pandas 2.x no longer discards them silently, so df.corr() raises on strings.
# NOTE(review): the encoded_* columns are integer codes of nominal categories, so their
# correlation values should be interpreted with caution.
numeric_df = df.select_dtypes(include='number')
correlation_matrix = numeric_df.corr()
condition_correlation = correlation_matrix['encoded_Conditions']

# Print how every numeric column correlates with the encoded condition
print(condition_correlation)

S.N                   0.018953
Year_BS               0.020503
Year_AD               0.020503
Male                 -0.005553
Female                0.004533
Age                  -0.024765
Married              -0.007749
Unmarried             0.002816
Lat                   0.033840
Long                 -0.032860
encoded_Conditions    1.000000
encoded_Types        -0.014775
encoded_Education     0.007947
encoded_Employment   -0.013090
Name: encoded_Conditions, dtype: float64






In [None]:
# Heatmap of the correlation matrix; both axes are labelled with the matrix's column names
axis_labels = correlation_matrix.columns
heatmap_fig = px.imshow(correlation_matrix, x=axis_labels, y=axis_labels)
heatmap_fig.update_layout(title="Correlation Matrix")
heatmap_fig.show()

## Data Modeling & Development: Random Forest

In [None]:
# Independent variables (features) and dependent variable (target) for the models
feature_columns = [
    'Age',
    'encoded_Types',
    'encoded_Education',
    'encoded_Employment',
    'Male',
    'Female',
    'Married',
    'Unmarried',
]
X = df[feature_columns]
y = df["Condition"]

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Build a Random Forest of 100 trees with a fixed seed, then fit it on the training split
random_forest_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
random_forest_model.fit(X_train, y_train)

In [None]:
# Predict the condition of every row of the test set with the Random Forest
y_pred = random_forest_model.predict(X_test)
print("Predicted condition by Random Forest Classifier:", y_pred, sep="\n")

Predicted condition by Random Forest Classifier:
['Major' 'Severe' 'Major' 'Minor' 'Severe' 'Major' 'Normal' 'Normal'
 'Severe' 'Major' 'Severe' 'Severe' 'Severe' 'Major' 'Major' 'Minor'
 'Minor' 'Severe' 'Normal' 'Severe' 'Minor' 'Normal' 'Severe' 'Minor'
 'Severe' 'Normal' 'Major' 'Severe' 'Normal' 'Normal' 'Severe' 'Minor'
 'Minor' 'Normal' 'Major' 'Normal' 'Normal' 'Major' 'Severe' 'Major'
 'Normal' 'Minor' 'Severe' 'Major' 'Normal' 'Severe' 'Normal' 'Major'
 'Normal' 'Minor' 'Severe' 'Minor' 'Minor' 'Major' 'Normal' 'Normal'
 'Minor' 'Major' 'Normal' 'Normal' 'Severe' 'Normal' 'Minor' 'Severe'
 'Severe' 'Normal' 'Normal' 'Normal' 'Severe' 'Minor' 'Severe' 'Minor'
 'Severe' 'Severe' 'Major' 'Major' 'Severe' 'Normal' 'Minor' 'Major'
 'Severe' 'Normal' 'Normal' 'Normal' 'Minor' 'Severe' 'Normal' 'Normal'
 'Major' 'Minor' 'Major' 'Minor' 'Minor' 'Major' 'Major' 'Major' 'Major'
 'Severe' 'Normal' 'Normal' 'Minor' 'Minor' 'Minor' 'Minor' 'Minor'
 'Major' 'Severe' 'Normal' 'Major' 'Major

In [None]:
# Copy of the test features with the predicted condition attached, used only for plotting
viz_df = X_test.assign(Predicted_Conditions=y_pred)

# Histogram of how often each condition was predicted
fig = px.histogram(
    viz_df,
    x='Predicted_Conditions',
    nbins=7,
    title='Distribution of Predicted Conditions By Random Forest Classifier',
)
fig.show()

In [None]:
# Importance assigned to every feature by the trained Random Forest
feature_importances = random_forest_model.feature_importances_
feature_names = X.columns

# Table of features and their importances, most important first
feature_importance_df = pd.DataFrame(
    {"Feature": feature_names, "Importance": feature_importances}
).sort_values(by="Importance", ascending=False)

# Bar chart of the feature importances with Plotly Express
fig = px.bar(
    feature_importance_df,
    x='Feature',
    y='Importance',
    title='Feature Importance of Random Forest Classifier',
)
fig.show()

## Evaluation & Visualizing The Performance of The Random Forest Model

In [None]:
# Score the Random Forest predictions against the held-out labels
accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy)

# Per-class precision / recall / F1, followed by the raw confusion matrix
print("\nClassification Report:\n", report_text)
print("\nConfusion Matrix:\n", conf_matrix)

Accuracy: 0.7561904761904762

Classification Report:
               precision    recall  f1-score   support

       Major       0.70      0.74      0.72       135
       Minor       0.82      0.74      0.78       126
      Normal       0.83      0.76      0.79       145
      Severe       0.69      0.79      0.74       119

    accuracy                           0.76       525
   macro avg       0.76      0.76      0.76       525
weighted avg       0.76      0.76      0.76       525


Confusion Matrix:
 [[100  10   7  18]
 [ 13  93   8  12]
 [ 14   9 110  12]
 [ 15   2   8  94]]


In [None]:
# Heatmap of the confusion matrix (rows: actual condition, columns: predicted condition).
# The tick labels are derived from the data instead of being hard-coded: confusion_matrix
# orders its rows/columns by the sorted labels that occur in y_test or y_pred, so the same
# sorted union keeps the axis labels aligned with the matrix if the classes ever change.
class_labels = sorted(set(y_test) | set(y_pred))
fig = px.imshow(conf_matrix,
                labels=dict(x="Predicted", y="Actual", color="Count"),
                x=class_labels,
                y=class_labels, title="Confusion Matrix")

# Show the plot
fig.show()

## Data Modeling & Development: K-Nearest Neighbors

In [None]:
# Initialize the KNN model (each prediction is a vote among the 6 nearest training samples)
knn = KNeighborsClassifier(n_neighbors=6)
# Train the KNN model
# NOTE(review): the features are passed unscaled; KNN is distance-based, so presumably the
# wide-range count columns dominate the small encoded_* codes - consider scaling; confirm.
knn.fit(X_train, y_train)

In [None]:
# Predict the condition of every row of the test set with the KNN model
y_pred = knn.predict(X_test)
print("Predicted condition by K Neighors Classifier:", y_pred, sep="\n")

Predicted condition by K Neighors Classifier:
['Major' 'Major' 'Major' 'Major' 'Minor' 'Minor' 'Minor' 'Severe' 'Major'
 'Major' 'Severe' 'Major' 'Severe' 'Minor' 'Major' 'Minor' 'Severe'
 'Severe' 'Minor' 'Major' 'Major' 'Normal' 'Severe' 'Major' 'Minor'
 'Major' 'Major' 'Severe' 'Normal' 'Normal' 'Severe' 'Severe' 'Minor'
 'Major' 'Major' 'Minor' 'Normal' 'Major' 'Normal' 'Normal' 'Major'
 'Severe' 'Severe' 'Minor' 'Major' 'Minor' 'Normal' 'Major' 'Normal'
 'Severe' 'Normal' 'Major' 'Major' 'Major' 'Major' 'Minor' 'Normal'
 'Minor' 'Severe' 'Normal' 'Major' 'Normal' 'Major' 'Major' 'Major'
 'Severe' 'Major' 'Normal' 'Severe' 'Severe' 'Major' 'Minor' 'Major'
 'Minor' 'Normal' 'Normal' 'Normal' 'Minor' 'Major' 'Major' 'Severe'
 'Major' 'Normal' 'Normal' 'Minor' 'Severe' 'Major' 'Normal' 'Severe'
 'Minor' 'Major' 'Minor' 'Minor' 'Major' 'Major' 'Major' 'Major' 'Minor'
 'Normal' 'Minor' 'Major' 'Minor' 'Minor' 'Minor' 'Minor' 'Minor' 'Major'
 'Major' 'Major' 'Major' 'Normal' 'Normal' 'Se

In [None]:
# Copy of the test features with the predicted condition attached, used only for plotting
viz_df = X_test.assign(Predicted_Conditions=y_pred)

# Histogram of how often each condition was predicted
fig = px.histogram(
    viz_df,
    x='Predicted_Conditions',
    nbins=7,
    title='Distribution of Predicted Conditions By K Neighbors Classifer',
)
fig.show()

## Evaluation & Visualizing The Performance of The K-Nearest Neighbors Model

In [None]:
# Score the KNN predictions against the held-out labels
accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy)

# Per-class precision / recall / F1, followed by the raw confusion matrix
print("\nClassification Report:\n", report_text)
print("\nConfusion Matrix:\n", conf_matrix)

Accuracy: 0.3923809523809524

Classification Report:
               precision    recall  f1-score   support

       Major       0.34      0.47      0.39       135
       Minor       0.40      0.44      0.42       126
      Normal       0.46      0.33      0.39       145
      Severe       0.40      0.33      0.36       119

    accuracy                           0.39       525
   macro avg       0.40      0.39      0.39       525
weighted avg       0.40      0.39      0.39       525


Confusion Matrix:
 [[63 23 28 21]
 [31 56 19 20]
 [47 33 48 17]
 [44 27  9 39]]


In [None]:
# Heatmap of the confusion matrix (rows: actual condition, columns: predicted condition).
# The tick labels are derived from the data instead of being hard-coded: confusion_matrix
# orders its rows/columns by the sorted labels that occur in y_test or y_pred, so the same
# sorted union keeps the axis labels aligned with the matrix if the classes ever change.
class_labels = sorted(set(y_test) | set(y_pred))
fig = px.imshow(conf_matrix,
                labels=dict(x="Predicted", y="Actual", color="Count"),
                x=class_labels,
                y=class_labels, title="Confusion Matrix")

# Show the plot
fig.show()

## Data Modeling & Development: Logistic Regression

In [None]:
# Initialize the Logistic Regression model (the solver may run up to 1000 iterations)
logreg = LogisticRegression(max_iter=1000)
# Train the Logistic Regression model
logreg.fit(X_train, y_train)

In [None]:
# Predict the condition of every row of the test set with the Logistic Regression model
y_pred = logreg.predict(X_test)
print("Predicted condition by Logistic Regression:", y_pred, sep="\n")

Predicted condition by Logistic Regression:
['Severe' 'Normal' 'Major' 'Minor' 'Minor' 'Normal' 'Severe' 'Severe'
 'Major' 'Severe' 'Normal' 'Severe' 'Major' 'Major' 'Major' 'Minor'
 'Minor' 'Minor' 'Normal' 'Severe' 'Normal' 'Severe' 'Severe' 'Severe'
 'Normal' 'Severe' 'Normal' 'Major' 'Minor' 'Normal' 'Severe' 'Minor'
 'Minor' 'Normal' 'Severe' 'Minor' 'Normal' 'Major' 'Severe' 'Major'
 'Major' 'Normal' 'Major' 'Major' 'Minor' 'Severe' 'Major' 'Minor' 'Major'
 'Severe' 'Minor' 'Major' 'Severe' 'Minor' 'Normal' 'Severe' 'Minor'
 'Major' 'Normal' 'Normal' 'Minor' 'Minor' 'Major' 'Major' 'Severe'
 'Severe' 'Major' 'Major' 'Major' 'Major' 'Severe' 'Normal' 'Severe'
 'Severe' 'Normal' 'Minor' 'Severe' 'Major' 'Minor' 'Minor' 'Minor'
 'Normal' 'Major' 'Major' 'Major' 'Severe' 'Major' 'Severe' 'Minor'
 'Minor' 'Major' 'Minor' 'Minor' 'Severe' 'Major' 'Minor' 'Major' 'Normal'
 'Major' 'Severe' 'Minor' 'Minor' 'Normal' 'Minor' 'Minor' 'Severe'
 'Severe' 'Major' 'Severe' 'Normal' 'Severe' 'Mi

In [None]:
# Copy of the test features with the predicted condition attached, used only for plotting
viz_df = X_test.assign(Predicted_Conditions=y_pred)

# Histogram of how often each condition was predicted
fig = px.histogram(
    viz_df,
    x='Predicted_Conditions',
    nbins=7,
    title='Distribution of Predicted Conditions By Logistic Regression',
)
fig.show()

## Evaluation & Visualizing The Performance of The Logistic Regression Model

In [None]:
# Score the Logistic Regression predictions against the held-out labels
accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy)

# Per-class precision / recall / F1, followed by the raw confusion matrix
print("\nClassification Report:\n", report_text)
print("\nConfusion Matrix:\n", conf_matrix)

Accuracy: 0.30095238095238097

Classification Report:
               precision    recall  f1-score   support

       Major       0.29      0.30      0.30       135
       Minor       0.33      0.35      0.34       126
      Normal       0.37      0.23      0.29       145
      Severe       0.24      0.34      0.28       119

    accuracy                           0.30       525
   macro avg       0.31      0.30      0.30       525
weighted avg       0.31      0.30      0.30       525


Confusion Matrix:
 [[40 31 20 44]
 [24 44 22 36]
 [33 34 34 44]
 [39 23 17 40]]


In [None]:
fig = px.imshow(conf_matrix,
                labels=dict(x="Predicted", y="Actual", color="Count"),
                x=['Major', 'Minor', 'Normal', 'Severe'],
                y=['Major', 'Minor', 'Normal', 'Severe'],title="Confusion Matrix")

# Show the plot
fig.show()