<a href="https://colab.research.google.com/github/AnjaliAnand395/DATA-PIPELINE-DEVELOPMENT/blob/main/TASK1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import os

In [None]:
# Input and output locations used by the rest of the notebook

INPUT_FILE = '/content/diabetes.xlsx'    # source workbook read by the load step
OUTPUT_DIR = './processed_data/'         # directory that receives the processed CSV
PROCESSED_FILE = os.path.join(OUTPUT_DIR, 'diabetes_processed.csv')  # full path of the CSV written later

In [None]:
# Read the source workbook into a DataFrame and report its dimensions
print('Loading data...')
df = pd.read_excel(INPUT_FILE)
print(f'Data loaded.shape: {df.shape}')

Loading data...
Data loaded.shape: (768, 9)


In [None]:
# Structural overview: column schema first, then the leading rows

print('\nData Info: ')
df.info()
# A single print with a newline separator emits the heading and the preview
# exactly as two consecutive print calls would.
print('\nSample Data: ', df.head(), sep='\n')


Data Info: 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB

Sample Data: 
   Pregnancies  Glucose  BloodPressure  ...  DiabetesPedigreeFunction  Age  Outcome
0            6      148             72  ...                     0.627   50        1
1            1       85             66  ... 

In [None]:
# Handling missing values
#
# Report every column that contains missing entries, then drop the rows
# that hold them.
# NOTE(review): isna() only flags NaN/None; any sentinel encoding of
# "missing" (for example a literal 0) would pass through unnoticed —
# confirm against the dataset description.

print('\nChecking for missing values...')
missing_summary = df.isna().sum()
print(missing_summary[missing_summary > 0])
# Rebind the name instead of using inplace=True so the frame shown by the
# earlier cells is not mutated behind the reader's back.
df = df.dropna()
print(f'After dropping missing values: {df.shape}')


Checking for missing values...
Series([], dtype: int64)
After dropping missing values: (768, 9)


In [None]:
# Separate the label column from the predictor columns

y = df['Outcome']
X = df.drop('Outcome', axis='columns')

In [None]:
# Define numerical and categorical columns
#
# 'number' selects every numeric dtype rather than only int64/float64, so
# columns stored with another width (e.g. int32 or float32) are not
# silently left out of both lists and then dropped by the ColumnTransformer.

numerical_features = X.select_dtypes(include='number').columns.tolist()
categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()

In [None]:
# Build the column-wise preprocessing step: scale the numeric columns and
# one-hot encode the categorical ones

print('\nCreating preprocessing pipeline...')
numeric_step = ('num', StandardScaler(), numerical_features)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])


Creating preprocessing pipeline...


In [None]:
# Wrap the preprocessor in a Pipeline so it is driven through one object

pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

In [None]:
# Fit the pipeline on the feature matrix and transform it in the same call

print('\nTransforming data...')
X_processed = pipeline.fit_transform(X)


Transforming data...


In [None]:
# Save the processed data
#
# Writes the transformed features plus the 'Outcome' column to PROCESSED_FILE,
# keeping meaningful column headers instead of positional 0..N-1 labels.

print('\nSaving processed data...')
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Recover column names from the fitted pipeline. ColumnTransformer prefixes
# each name with "<transformer>__"; strip that prefix for readability, but
# fall back to the prefixed names if stripping would create duplicates.
verbose_names = list(pipeline.get_feature_names_out())
short_names = [name.split('__', 1)[-1] for name in verbose_names]
feature_names = short_names if len(set(short_names)) == len(short_names) else verbose_names

# ColumnTransformer can return a scipy sparse matrix when one-hot encoded
# columns are present; densify so DataFrame gets a regular 2-D array.
X_dense = X_processed.toarray() if hasattr(X_processed, 'toarray') else X_processed

df_processed = pd.DataFrame(X_dense, columns=feature_names)
# reset_index aligns the target positionally with the freshly built frame,
# whose index is a plain 0..N-1 range.
df_processed['Outcome'] = y.reset_index(drop=True)
df_processed.to_csv(PROCESSED_FILE, index=False)
print(f'Processed data saved to {PROCESSED_FILE}')



Saving processed data...
Processed data saved to ./processed_data/diabetes_processed.csv


In [None]:
# Final status line marking the end of the ETL run
completion_message = '\nETL process complete. '
print(completion_message)


ETL process complete. 


In [None]:
# Load the processed file
#
# Reuse the path the save step wrote to (PROCESSED_FILE) rather than a second
# hard-coded absolute path, which only resolves when the working directory
# happens to be /content.

processed_file = PROCESSED_FILE
df_processed = pd.read_csv(processed_file)

In [None]:
# Show the leading rows, then the schema summary of the reloaded frame

# One print with a newline separator yields the same text as printing the
# preview and the heading in two separate calls.
print(df_processed.head(), "\nData INfo:", sep="\n")
df_processed.info()


          0         1         2  ...         6         7  Outcome
0  0.639947  0.848324  0.149641  ...  0.468492  1.425995        1
1 -0.844885 -1.123396 -0.160546  ... -0.365061 -0.190672        0
2  1.233880  1.943724 -0.263941  ...  0.604397 -0.105584        1
3 -0.844885 -0.998208 -0.160546  ... -0.920763 -1.041549        0
4 -1.141852  0.504055 -1.504687  ...  5.484909 -0.020496        1

[5 rows x 9 columns]

Data INfo:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   0        768 non-null    float64
 1   1        768 non-null    float64
 2   2        768 non-null    float64
 3   3        768 non-null    float64
 4   4        768 non-null    float64
 5   5        768 non-null    float64
 6   6        768 non-null    float64
 7   7        768 non-null    float64
 8   Outcome  768 non-null    int64  
dtypes: float64(8), int64(1)
memory usage: 54.1 