## Importing Libraries

In [1]:
# Numerical operations
import numpy as np

# DataFrame manipulation
import pandas as pd

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns
# Apply the 'fivethirtyeight' matplotlib style sheet to subsequent figures.
plt.style.use('fivethirtyeight')

# Install a filter that ignores every warning from this point on.
# NOTE(review): a blanket 'ignore' also hides pandas deprecation and dtype
# warnings; consider narrowing it to specific categories.
import warnings
warnings.filterwarnings('ignore')

## Read Data

In [2]:
# Load the output of the previous cleaning stage into memory.
source_csv = 'combined_output_cleand_part_1.csv'
data = pd.read_csv(source_csv)

In [3]:
# Show the first five rows as a quick check of the loaded columns.
data.head(n=5)

Unnamed: 0,timestamp,P-PDG,P-TPT,T-TPT,P-MON-CKP,T-JUS-CKP,class
0,2017-02-01 02:02:07.000000,0.0,10092110.0,119.0944,1609800.0,84.59782,0.0
1,2017-02-01 02:02:08.000000,0.0,10092000.0,119.0944,1618206.0,84.58997,0.0
2,2017-02-01 02:02:09.000000,0.0,10091890.0,119.0944,1626612.0,84.58213,0.0
3,2017-02-01 02:02:10.000000,0.0,10091780.0,119.0944,1635018.0,84.57429,0.0
4,2017-02-01 02:02:11.000000,0.0,10091670.0,119.0944,1643424.0,84.56644,0.0


`Convert timestamp to datetime`

In [4]:
# Parse the timestamp strings into datetime64 values so the `.dt` accessor
# can be used on the column.
data['timestamp'] = data['timestamp'].pipe(pd.to_datetime)

`Extract components (year, month, day, hour, minute, second)`

In [5]:
# Add one column per calendar/clock component of the timestamp. Each new
# column is named after the `.dt` attribute it is read from, and the columns
# are appended in the order listed.
for part in ('year', 'month', 'day', 'hour', 'minute', 'second'):
    data[part] = getattr(data['timestamp'].dt, part)

In [6]:
# The timestamp has been split into separate component columns, so remove
# the original column.
data = data.drop(columns=['timestamp'])

In [7]:
# Show the first five rows to check the new date-part columns.
data.head(n=5)

Unnamed: 0,P-PDG,P-TPT,T-TPT,P-MON-CKP,T-JUS-CKP,class,year,month,day,hour,minute,second
0,0.0,10092110.0,119.0944,1609800.0,84.59782,0.0,2017,2,1,2,2,7
1,0.0,10092000.0,119.0944,1618206.0,84.58997,0.0,2017,2,1,2,2,8
2,0.0,10091890.0,119.0944,1626612.0,84.58213,0.0,2017,2,1,2,2,9
3,0.0,10091780.0,119.0944,1635018.0,84.57429,0.0,2017,2,1,2,2,10
4,0.0,10091670.0,119.0944,1643424.0,84.56644,0.0,2017,2,1,2,2,11


## Data Cleaning 

`Check Null Values`

In [8]:
# Count missing values per column once and reuse the result for both summary
# columns. The original evaluated `data.isna().sum().sort_values(...)` twice,
# which repeats a full scan of the frame for no change in output.
null_counts = data.isna().sum().sort_values(ascending=False)
null = pd.DataFrame({
    'Null Values': null_counts,
    # Share of rows that are missing, expressed as a percentage.
    'Percentage Null Values': null_counts / data.shape[0] * 100,
})
# Last expression of the cell: render the summary with a colour gradient.
null.style.background_gradient(cmap="Wistia")

Unnamed: 0,Null Values,Percentage Null Values
T-TPT,5810976,11.413493
T-JUS-CKP,1702789,3.344493
P-MON-CKP,1121628,2.203019
P-TPT,6184,0.012146
P-PDG,5905,0.011598
class,5130,0.010076
year,0,0.0
month,0,0.0
day,0,0.0
hour,0,0.0


In [9]:
# Print the frame's index range, per-column dtypes, and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50913215 entries, 0 to 50913214
Data columns (total 12 columns):
 #   Column     Dtype  
---  ------     -----  
 0   P-PDG      float64
 1   P-TPT      float64
 2   T-TPT      float64
 3   P-MON-CKP  float64
 4   T-JUS-CKP  float64
 5   class      float64
 6   year       int32  
 7   month      int32  
 8   day        int32  
 9   hour       int32  
 10  minute     int32  
 11  second     int32  
dtypes: float64(6), int32(6)
memory usage: 3.4 GB


`Fill null values with each column's mean, except class, which is filled with its mode`

In [13]:
# Sensor readings: replace missing values with that column's own mean.
for column in ('T-TPT', 'T-JUS-CKP', 'P-MON-CKP', 'P-TPT', 'P-PDG'):
    column_mean = data[column].mean()
    data[column] = data[column].fillna(column_mean)

# Label column: replace missing values with the most frequent label.
# `mode()` returns its values sorted, so `[0]` picks the smallest on a tie.
mode_value = data['class'].mode()[0]
data['class'] = data['class'].fillna(mode_value)

In [14]:
# Re-check missing values after imputation. The per-column counts are computed
# once and reused; the original evaluated `data.isna().sum().sort_values(...)`
# twice, which repeats a full scan of the frame for no change in output.
null_counts = data.isna().sum().sort_values(ascending=False)
null = pd.DataFrame({
    'Null Values': null_counts,
    # Share of rows that are missing, expressed as a percentage.
    'Percentage Null Values': null_counts / data.shape[0] * 100,
})
# Last expression of the cell: render the summary with a colour gradient.
null.style.background_gradient(cmap="Wistia")

Unnamed: 0,Null Values,Percentage Null Values
P-PDG,0,0.0
P-TPT,0,0.0
T-TPT,0,0.0
P-MON-CKP,0,0.0
T-JUS-CKP,0,0.0
class,0,0.0
year,0,0.0
month,0,0.0
day,0,0.0
hour,0,0.0


`Convert the class and P-PDG columns to int`

In [18]:
# Cast the label and P-PDG columns from float to integer.
# NOTE(review): `astype(int)` drops the fractional part, so any P-PDG gaps
# that were filled with a non-integer mean are truncated here.
for column in ('class', 'P-PDG'):
    data[column] = data[column].astype(int)

`Binarize the class column: keep 0 as 0 and replace every non-zero value with 1`

In [16]:
# Collapse the label to binary: 0 stays 0, every other value becomes 1.
# A vectorized comparison replaces the original `apply(lambda ...)`, which
# called a Python function once per row. The result is the same int64 column
# of 0/1 values.
data['class'] = data['class'].ne(0).astype('int64')

In [19]:
# Show the first five rows of the cleaned frame.
data.head(n=5)

Unnamed: 0,P-PDG,P-TPT,T-TPT,P-MON-CKP,T-JUS-CKP,class,year,month,day,hour,minute,second
0,0,10092110.0,119.0944,1609800.0,84.59782,0,2017,2,1,2,2,7
1,0,10092000.0,119.0944,1618206.0,84.58997,0,2017,2,1,2,2,8
2,0,10091890.0,119.0944,1626612.0,84.58213,0,2017,2,1,2,2,9
3,0,10091780.0,119.0944,1635018.0,84.57429,0,2017,2,1,2,2,10
4,0,10091670.0,119.0944,1643424.0,84.56644,0,2017,2,1,2,2,11


## Save Data as a CSV file

In [20]:
# Write the cleaned frame to disk without the index column, then confirm
# on stdout.
output_stem = "combined_output_cleaned_part_2"
output_csv = f"{output_stem}.csv"
data.to_csv(output_csv, index=False)
print(f"{output_stem} has been saved to '{output_csv}'")

combined_output_cleaned_part_2 has been saved to 'combined_output_cleaned_part_2.csv'
