<a href="https://colab.research.google.com/github/DeepakKumar2005fg/AIML-/blob/main/Untitled12.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np

# Step 1: Load the dataset
df = pd.read_csv('/content/concrete.csv')

# Display the first few rows of the dataframe
print("First few rows of the dataset:")
print(df.head())

# Step 2: Explore the dataset
print("\nDataset Information:")
print(df.info())

print("\nDescriptive Statistics:")
print(df.describe())

# Check for duplicate rows
duplicates = df.duplicated().sum()
print(f"\nNumber of duplicate rows: {duplicates}")

# Check for missing values
missing_values = df.isnull().sum()
print("\nMissing values in each column:")
print(missing_values)

# Step 3: Handle missing values
# Dropping rows with any missing values
df_cleaned = df.dropna()

# Alternatively, filling missing values
# For numerical columns, filling with the mean
for column in df.select_dtypes(include=[np.number]).columns:
    df[column].fillna(df[column].mean(), inplace=True)

# For categorical columns, filling with the mode
for column in df.select_dtypes(include=[object]).columns:
    df[column].fillna(df[column].mode()[0], inplace=True)

# Step 4: Data Transformation
# Normalizing a numerical column (example: 'numerical_column')
if 'numerical_column' in df.columns:
    df['normalized_column'] = (df['numerical_column'] - df['numerical_column'].mean()) / df['numerical_column'].std()

# One-hot encoding for categorical variables (example: 'categorical_column')
if 'categorical_column' in df.columns:
    df = pd.get_dummies(df, columns=['categorical_column'], drop_first=True)

# Step 5: Feature Engineering
# Creating a new feature (example: product of two features)
if 'feature1' in df.columns and 'feature2' in df.columns:
    df['new_feature'] = df['feature1'] * df['feature2']

# If there's a datetime column, extract year, month, and day
if 'date_column' in df.columns:
    df['date_column'] = pd.to_datetime(df['date_column'])
    df['year'] = df['date_column'].dt.year
    df['month'] = df['date_column'].dt.month
    df['day'] = df['date_column'].dt.day

# Step 6: Save the preprocessed dataset
df.to_csv('cleaned_data.csv', index=False)
print("\nPreprocessed data saved to 'cleaned_data.csv'.")

First few rows of the dataset:
   cement   slag  ash  water  superplastic  coarseagg  fineagg  age  strength
0   540.0    0.0  0.0  162.0           2.5     1040.0    676.0   28     79.99
1   540.0    0.0  0.0  162.0           2.5     1055.0    676.0   28     61.89
2   332.5  142.5  0.0  228.0           0.0      932.0    594.0  270     40.27
3   332.5  142.5  0.0  228.0           0.0      932.0    594.0  365     41.05
4   198.6  132.4  0.0  192.0           0.0      978.4    825.5  360     44.30

Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1030 entries, 0 to 1029
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   cement        1030 non-null   float64
 1   slag          1030 non-null   float64
 2   ash           1030 non-null   float64
 3   water         1030 non-null   float64
 4   superplastic  1030 non-null   float64
 5   coarseagg     1030 non-null   float64
 6   fineagg       1030 non-n

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(df[column].mean(), inplace=True)
