In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt

In [3]:
data = pd.read_csv('./Data/data.csv')
data

Unnamed: 0,name,city,gender,profession,age,cgpa,placed
0,moriarity,,female,phd,28.0,5.94,1
1,moriarity,asgard,,bachelor,50.0,8.55,0
2,holmes,,female,masters,18.0,5.56,0
3,sam,,male,bachelor,25.0,8.57,1
4,sam,,male,bachelor,19.0,8.76,1
...,...,...,...,...,...,...,...
1095,holmes,,male,masters,26.0,8.92,1
1096,moriarity,wakanda,male,masters,19.0,9.01,0
1097,sam,asgard,male,bachelor,30.0,7.88,0
1098,dean,gotham,male,masters,28.0,,1


In [4]:
data.isna().sum()

name           91
city          195
gender        106
profession    179
age           118
cgpa          138
placed          0
dtype: int64

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1100 entries, 0 to 1099
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   name        1009 non-null   object 
 1   city        905 non-null    object 
 2   gender      994 non-null    object 
 3   profession  921 non-null    object 
 4   age         982 non-null    float64
 5   cgpa        962 non-null    float64
 6   placed      1100 non-null   int64  
dtypes: float64(2), int64(1), object(4)
memory usage: 60.3+ KB


In [6]:
# Data Cleaning : Feature Selection, Handling Missing Values
data.drop(columns=['name'], inplace=True)
for col in ['age','cgpa']:
    data[col].fillna(data[col].median(), inplace=True)
for col in ['city','gender','profession']:
    data[col].fillna(data[col].mode()[0], inplace=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1100 entries, 0 to 1099
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   city        1100 non-null   object 
 1   gender      1100 non-null   object 
 2   profession  1100 non-null   object 
 3   age         1100 non-null   float64
 4   cgpa        1100 non-null   float64
 5   placed      1100 non-null   int64  
dtypes: float64(2), int64(1), object(3)
memory usage: 51.7+ KB


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are settin

In [7]:
data['profession'].value_counts()

profession
bachelor    696
masters     273
phd         131
Name: count, dtype: int64

In [8]:
data['city'].value_counts()

city
wakanda       586
gotham        243
asgard        152
purgatory     119
Name: count, dtype: int64

In [9]:
data['gender'].value_counts()

gender
male      866
female    234
Name: count, dtype: int64

In [10]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

data['profession'] = data['profession'].map({'bachelor': 0, 'masters': 1, 'phd': 2})
# Converting 'gender' and 'city' into dummy variables
# drop_first=True is used to avoid the 'Dummy Variable Trap' (multicollinearity)
data = pd.get_dummies(data, columns=['gender', 'city'], drop_first=True)
data.head(15)

Unnamed: 0,profession,age,cgpa,placed,gender_male,city_gotham,city_purgatory,city_wakanda
0,2,28.0,5.94,1,False,False,False,True
1,0,50.0,8.55,0,True,False,False,False
2,1,18.0,5.56,0,False,False,False,True
3,0,25.0,8.57,1,True,False,False,True
4,0,19.0,8.76,1,True,False,False,True
5,0,20.0,7.68,0,True,False,True,False
6,0,20.0,9.01,0,True,False,False,True
7,0,20.0,8.31,0,True,False,False,True
8,0,19.0,7.2,0,False,False,False,False
9,1,26.0,8.35,1,False,False,False,True


In [11]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Feature Scaling
data[['age', 'cgpa']] = StandardScaler().fit_transform(data[['age', 'cgpa']])
data.describe().round(2)

Unnamed: 0,profession,age,cgpa,placed
count,1100.0,1100.0,1100.0,1100.0
mean,0.49,0.0,-0.0,0.5
std,0.7,1.0,1.0,0.5
min,0.0,-1.17,-2.59,0.0
25%,0.0,-0.71,-0.27,0.0
50%,0.0,-0.25,0.2,0.0
75%,1.0,0.1,0.71,1.0
max,2.0,2.52,1.31,1.0


In [12]:
data.to_csv('./Data/cleanedData.csv',index=False)