# Demonstrate various data pre-processing techniques for a given dataset. 

### 1.Reshaping the data, 

In [42]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

In [43]:
df = pd.read_csv('./diabetes.csv')

In [44]:
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


### 2. filter the data

display the people whose age is less than 50

In [4]:
def filter_data(df, condition):
    df_filtered = df[df['Age']<condition]
    return df_filtered

print(filter_data(df, 50))

     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
1              1       85             66             29        0  26.6   
2              8      183             64              0        0  23.3   
3              1       89             66             23       94  28.1   
4              0      137             40             35      168  43.1   
5              5      116             74              0        0  25.6   
..           ...      ...            ...            ...      ...   ...   
762            9       89             62              0        0  22.5   
764            2      122             70             27        0  36.8   
765            5      121             72             23      112  26.2   
766            1      126             60              0        0  30.1   
767            1       93             70             31        0  30.4   

     DiabetesPedigreeFunction  Age  Outcome  
1                       0.351   31        0  
2                  

display the age and BP of the people whose age is less than 50

In [5]:
def filter_data(df, age, columns):
    df_filtered = df[columns][df['Age']<age]
    return df_filtered

filter_data(df,50,['Age', 'BloodPressure'])

Unnamed: 0,Age,BloodPressure
1,31,66
2,32,64
3,21,66
4,33,40
5,30,74
...,...,...
762,33,62
764,27,70
765,30,72
766,47,60


### 3. Merging the data 

In [6]:
df1 = pd.DataFrame({
    "name":["amal", "athul", "nazeem"],
    "age" : [24,26,23]
})

df2 = pd.DataFrame({
    "name" : ["amal", "athul", "nazeem","satya"],
    "marks" : [30,50,45,49]
})


In [7]:
df1

Unnamed: 0,name,age
0,amal,24
1,athul,26
2,nazeem,23


In [8]:
df2

Unnamed: 0,name,marks
0,amal,30
1,athul,50
2,nazeem,45
3,satya,49


In [9]:
df3 = pd.merge(df1, df2, how="left",on="name")

In [10]:
df3

Unnamed: 0,name,age,marks
0,amal,24,30
1,athul,26,50
2,nazeem,23,45


### 4. Handling the missing values in datasets 

In [11]:
df = pd.read_csv("./DP_LIVE_04102023172539884.csv")

In [12]:
df.isnull().sum()

LOCATION        0
INDICATOR       0
SUBJECT         0
MEASURE         0
FREQUENCY       0
TIME            0
Value           0
Flag Codes    225
dtype: int64

1. drop na

In [13]:
df1 = df.copy()

In [14]:
df1.isnull().sum()

LOCATION        0
INDICATOR       0
SUBJECT         0
MEASURE         0
FREQUENCY       0
TIME            0
Value           0
Flag Codes    225
dtype: int64

In [15]:
df1.dropna(axis=1, inplace=True)

In [16]:
df1.isnull().sum()

LOCATION     0
INDICATOR    0
SUBJECT      0
MEASURE      0
FREQUENCY    0
TIME         0
Value        0
dtype: int64

successfully droped

In [17]:
df2 = df.copy()

2. fill na usinhg mean or median or mode

In [25]:
df2_imputed = df2.fillna(df.mean())
df2_imputed.isnull().sum()


  df2_imputed = df2.fillna(df.mean())


LOCATION     0
INDICATOR    0
SUBJECT      0
MEASURE      0
FREQUENCY    0
TIME         0
Value        0
dtype: int64

In [26]:
df2_imputed = df2.fillna(df.mean())

missing_values_count = df2_imputed.isnull().sum()
print("Missing Values Count after imputation:")
print(missing_values_count)


Missing Values Count after imputation:
LOCATION     0
INDICATOR    0
SUBJECT      0
MEASURE      0
FREQUENCY    0
TIME         0
Value        0
dtype: int64


  df2_imputed = df2.fillna(df.mean())


this means maybe flag is a non numericc value so we can drop it

In [32]:
df2 = df.copy()

In [33]:
df2.drop(['Flag Codes'], axis=1, inplace=True)
df2.columns

Index(['LOCATION', 'INDICATOR', 'SUBJECT', 'MEASURE', 'FREQUENCY', 'TIME',
       'Value'],
      dtype='object')

### 5.Feature Normalization: Min-max normalization 

In [36]:
df

Unnamed: 0,LOCATION,INDICATOR,SUBJECT,MEASURE,FREQUENCY,TIME,Value,Flag Codes
0,AUS,HRWKD,TOT,HR_WKD,A,2018,1733.139979,
1,AUS,HRWKD,TOT,HR_WKD,A,2019,1722.925469,
2,AUS,HRWKD,TOT,HR_WKD,A,2020,1683.851538,
3,AUS,HRWKD,TOT,HR_WKD,A,2021,1694.799343,
4,AUS,HRWKD,TOT,HR_WKD,A,2022,1707.325609,
...,...,...,...,...,...,...,...,...
220,EU27,HRWKD,TOT,HR_WKD,A,2018,1598.046690,
221,EU27,HRWKD,TOT,HR_WKD,A,2019,1592.987477,
222,EU27,HRWKD,TOT,HR_WKD,A,2020,1505.611235,
223,EU27,HRWKD,TOT,HR_WKD,A,2021,1560.286783,


In [56]:
from sklearn.preprocessing import MinMaxScaler

In [57]:
df = pd.read_csv("./diabetes.csv")
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [58]:
numeric_values = df.select_dtypes(include='number').columns.to_list()

In [59]:
numeric_values

['Pregnancies',
 'Glucose',
 'BloodPressure',
 'SkinThickness',
 'Insulin',
 'BMI',
 'DiabetesPedigreeFunction',
 'Age',
 'Outcome']

In [60]:
scaler = MinMaxScaler()
df[numeric_values] = scaler.fit_transform(df[numeric_values])

In [61]:
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,0.352941,0.743719,0.590164,0.353535,0.000000,0.500745,0.234415,0.483333,1.0
1,0.058824,0.427136,0.540984,0.292929,0.000000,0.396423,0.116567,0.166667,0.0
2,0.470588,0.919598,0.524590,0.000000,0.000000,0.347243,0.253629,0.183333,1.0
3,0.058824,0.447236,0.540984,0.232323,0.111111,0.418778,0.038002,0.000000,0.0
4,0.000000,0.688442,0.327869,0.353535,0.198582,0.642325,0.943638,0.200000,1.0
...,...,...,...,...,...,...,...,...,...
763,0.588235,0.507538,0.622951,0.484848,0.212766,0.490313,0.039710,0.700000,0.0
764,0.117647,0.613065,0.573770,0.272727,0.000000,0.548435,0.111870,0.100000,0.0
765,0.294118,0.608040,0.590164,0.232323,0.132388,0.390462,0.071307,0.150000,0.0
766,0.058824,0.633166,0.491803,0.000000,0.000000,0.448584,0.115713,0.433333,1.0
