In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, Normalizer, LabelEncoder, OrdinalEncoder

In [8]:
df_custom = pd.read_csv('data.csv')
print("Custom Dataset Shape:", df_custom.shape)
print("\nCustom Dataset - First 5 rows:")
print(df_custom.head())

df_tips = sns.load_dataset('tips')
df_flights = sns.load_dataset('flights')
df_titanic = sns.load_dataset('titanic')

print(f"\nTips Dataset Shape: {df_tips.shape}")
print(f"Flights Dataset Shape: {df_flights.shape}")
print(f"Titanic Dataset Shape: {df_titanic.shape}")

Custom Dataset Shape: (20, 9)

Custom Dataset - First 5 rows:
   customer_id   age   income education_level  purchase_amount     category  \
0            1  25.0  45000.0        Bachelor           156.75  Electronics   
1            2   NaN  52000.0          Master            89.50     Clothing   
2            3  45.0  78000.0             PhD           234.60        Books   
3            4  29.0      NaN        Bachelor            67.80  Electronics   
4            5  35.0  61000.0             NaN           123.45     Clothing   

   satisfaction_rating  gender  total_purchases  
0                  4.2    Male               12  
1                  3.8  Female                8  
2                  4.5    Male               25  
3                  NaN  Female                5  
4                  3.2    Male               15  

Tips Dataset Shape: (244, 7)
Flights Dataset Shape: (144, 3)
Titanic Dataset Shape: (891, 15)


In [10]:
numerical_cols = ['age', 'income', 'purchase_amount', 'satisfaction_rating', 'total_purchases']
categorical_cols = ['education_level', 'category', 'gender']

print("\nCustom Dataset Info:")
print(df_custom.info())
print("\nMissing Values in Custom Dataset:")
print(df_custom.isnull().sum())

print("\nTitanic Dataset Missing Values:")
print(df_titanic.isnull().sum())

print("\nBasic Statistics - Custom Dataset:")
print(df_custom.describe())


Custom Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   customer_id          20 non-null     int64  
 1   age                  18 non-null     float64
 2   income               18 non-null     float64
 3   education_level      18 non-null     object 
 4   purchase_amount      19 non-null     float64
 5   category             20 non-null     object 
 6   satisfaction_rating  17 non-null     float64
 7   gender               20 non-null     object 
 8   total_purchases      20 non-null     int64  
dtypes: float64(4), int64(2), object(3)
memory usage: 1.5+ KB
None

Missing Values in Custom Dataset:
customer_id            0
age                    2
income                 2
education_level        2
purchase_amount        1
category               0
satisfaction_rating    3
gender                 0
total_purchases        0

In [11]:
from sklearn.impute import SimpleImputer
df_processed = df_custom.copy()
imputer = SimpleImputer(strategy='mean')
df_processed[numerical_cols] = imputer.fit_transform(df_processed[numerical_cols])
print("After mean imputation - missing values:")
print(df_processed.isnull().sum())
df_processed['education_level'].fillna(df_processed['education_level'].mode()[0], inplace=True)
df_processed['category'].fillna(df_processed['category'].mode()[0], inplace=True)
df_processed['gender'].fillna(df_processed['gender'].mode()[0], inplace=True)
print("After mode imputation - missing values:")
print(df_processed.isnull().sum())

After mean imputation - missing values:
customer_id            0
age                    0
income                 0
education_level        2
purchase_amount        0
category               0
satisfaction_rating    0
gender                 0
total_purchases        0
dtype: int64
After mode imputation - missing values:
customer_id            0
age                    0
income                 0
education_level        0
purchase_amount        0
category               0
satisfaction_rating    0
gender                 0
total_purchases        0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_processed['education_level'].fillna(df_processed['education_level'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_processed['category'].fillna(df_processed['category'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace m

In [12]:
from sklearn.impute import KNNImputer
df_titanic_processed = df_titanic.copy()
imputer = KNNImputer(n_neighbors=5)
df_titanic_processed[['age']] = imputer.fit_transform(df_titanic_processed[['age']])
print("Titanic age missing values after KNN imputation:", df_titanic_processed['age'].isnull().sum())

Titanic age missing values after KNN imputation: 0


In [13]:
sample_data = df_custom.copy()
print("Before Forward Fill - missing values:")
print(sample_data.isnull().sum())
sample_data.fillna(method='ffill', inplace=True)
print("After Forward Fill - missing values:")
print(sample_data.isnull().sum())
sample_data.fillna(method='bfill', inplace=True)
print("After Backward Fill - missing values:")
print(sample_data.isnull().sum())

Before Forward Fill - missing values:
customer_id            0
age                    2
income                 2
education_level        2
purchase_amount        1
category               0
satisfaction_rating    3
gender                 0
total_purchases        0
dtype: int64
After Forward Fill - missing values:
customer_id            0
age                    0
income                 0
education_level        0
purchase_amount        0
category               0
satisfaction_rating    0
gender                 0
total_purchases        0
dtype: int64
After Backward Fill - missing values:
customer_id            0
age                    0
income                 0
education_level        0
purchase_amount        0
category               0
satisfaction_rating    0
gender                 0
total_purchases        0
dtype: int64


  sample_data.fillna(method='ffill', inplace=True)
  sample_data.fillna(method='bfill', inplace=True)


In [14]:
df_dropped_rows = df_custom.dropna(inplace=False)
print(f"Original shape: {df_custom.shape}")
print(f"After dropping rows with missing values: {df_dropped_rows.shape}")

df_dropped_cols = df_custom.dropna(axis=1, inplace=False)
print(f"After dropping columns with missing values: {df_dropped_cols.shape}")

Original shape: (20, 9)
After dropping rows with missing values: (11, 9)
After dropping columns with missing values: (20, 4)


In [15]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
df_scaled = df_processed.copy()

df_scaled[numerical_cols] = scaler.fit_transform(df_processed[numerical_cols])

print("Original statistics:")
print(df_processed[numerical_cols].describe())
print("\nAfter Min-Max Scaling:")
print(df_scaled[numerical_cols].describe())

Original statistics:
             age         income  purchase_amount  satisfaction_rating  \
count  20.000000      20.000000        20.000000            20.000000   
mean   36.111111   68277.777778       226.376316             4.111765   
std     9.408221   25765.757684       146.807945             0.562777   
min    22.000000   28000.000000        45.300000             2.800000   
25%    28.500000   50250.000000       117.275000             3.875000   
50%    35.555556   67638.888889       190.175000             4.111765   
75%    42.500000   79750.000000       310.475000             4.525000   
max    55.000000  120000.000000       567.900000             4.900000   

       total_purchases  
count        20.000000  
mean         21.700000  
std          13.183323  
min           3.000000  
25%          10.500000  
50%          21.500000  
75%          31.500000  
max          48.000000  

After Min-Max Scaling:
             age     income  purchase_amount  satisfaction_rating  \
cou

In [16]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
df_standard = df_processed.copy()
df_standard[numerical_cols] = scaler.fit_transform(df_processed[numerical_cols])

print("After Standard Scaling:")
print(df_standard[numerical_cols].describe())

After Standard Scaling:
                age        income  purchase_amount  satisfaction_rating  \
count  2.000000e+01  2.000000e+01     2.000000e+01         2.000000e+01   
mean   4.496403e-16 -1.096345e-16     1.562986e-16         8.104628e-16   
std    1.025978e+00  1.025978e+00     1.025978e+00         1.025978e+00   
min   -1.538834e+00 -1.603839e+00    -1.265465e+00        -2.391430e+00   
25%   -8.300013e-01 -7.178562e-01    -7.624627e-01        -4.316369e-01   
50%   -6.058403e-02 -2.544021e-02    -2.529956e-01         1.619205e-15   
75%    6.967164e-01  4.568176e-01     5.877300e-01         7.533539e-01   
max    2.059857e+00  2.059551e+00     2.386764e+00         1.437002e+00   

       total_purchases  
count     2.000000e+01  
mean      1.665335e-17  
std       1.025978e+00  
min      -1.455308e+00  
25%      -8.716283e-01  
50%      -1.556479e-02  
75%       7.626748e-01  
max       2.046770e+00  


In [17]:
from sklearn.preprocessing import Normalizer

normalizer = Normalizer(norm='l2')
df_normalized = df_processed.copy()
df_normalized[numerical_cols] = normalizer.fit_transform(df_processed[numerical_cols])

print("After L2 Normalization:")
print(df_normalized[numerical_cols].describe())

After L2 Normalization:
             age     income  purchase_amount  satisfaction_rating  \
count  20.000000  20.000000        20.000000            20.000000   
mean    0.000560   0.999994         0.003104             0.000068   
std     0.000109   0.000004         0.001164             0.000025   
min     0.000400   0.999982         0.000993             0.000041   
25%     0.000469   0.999993         0.002257             0.000050   
50%     0.000552   0.999995         0.003231             0.000061   
75%     0.000637   0.999997         0.003679             0.000076   
max     0.000786   0.999999         0.006010             0.000125   

       total_purchases  
count        20.000000  
mean          0.000296  
std           0.000111  
min           0.000073  
25%           0.000242  
50%           0.000322  
75%           0.000338  
max           0.000539  


In [18]:
from sklearn.preprocessing import LabelEncoder

df_encoded = df_processed.copy()

le = LabelEncoder()
df_encoded['gender_encoded'] = le.fit_transform(df_encoded['gender'])

print("Gender Label Encoding:")
print(df_encoded[['gender', 'gender_encoded']].drop_duplicates().sort_values('gender'))

Gender Label Encoding:
   gender  gender_encoded
1  Female               0
0    Male               1


In [19]:
df_onehot = pd.get_dummies(df_encoded['category'], prefix='category')
print("One-hot encoded category columns:")
print(df_onehot.columns)
print("\nOne-hot encoded data:")
print(df_onehot.head())

df_encoded = pd.concat([df_encoded, df_onehot], axis=1)
df_encoded.drop('category', axis=1, inplace=True)

One-hot encoded category columns:
Index(['category_Books', 'category_Clothing', 'category_Electronics'], dtype='object')

One-hot encoded data:
   category_Books  category_Clothing  category_Electronics
0           False              False                  True
1           False               True                 False
2            True              False                 False
3           False              False                  True
4           False               True                 False


In [20]:
from sklearn.preprocessing import OrdinalEncoder

encoder = OrdinalEncoder(categories=[['High School', 'Bachelor', 'Master', 'PhD']])
df_encoded['education_level_encoded'] = encoder.fit_transform(df_encoded[['education_level']])

print("Education Level Ordinal Encoding:")
education_mapping = pd.DataFrame({
    'education_level': ['High School', 'Bachelor', 'Master', 'PhD'],
    'encoded_value': [0, 1, 2, 3]
})
print(education_mapping)

Education Level Ordinal Encoding:
  education_level  encoded_value
0     High School              0
1        Bachelor              1
2          Master              2
3             PhD              3


In [21]:
df_binned = df_processed.copy()

df_binned['age_bins_equal'] = pd.cut(df_binned['age'], bins=4, labels=['Young', 'Adult', 'Middle-aged', 'Senior'])
print("Age equal-width binning:")
print(df_binned['age_bins_equal'].value_counts())

Age equal-width binning:
age_bins_equal
Adult          7
Young          6
Middle-aged    4
Senior         3
Name: count, dtype: int64


In [22]:
df_binned['age_bins_quantile'] = pd.qcut(df_binned['age'], q=4, labels=['Q1', 'Q2', 'Q3', 'Q4'])
print("Age equal-frequency binning:")
print(df_binned['age_bins_quantile'].value_counts())

Age equal-frequency binning:
age_bins_quantile
Q1    5
Q2    5
Q3    5
Q4    5
Name: count, dtype: int64


In [23]:
custom_bins = [0, 30, 50, 70, 100]
custom_labels = ['Youth', 'Young Adult', 'Middle Age', 'Senior']
df_binned['age_bins_custom'] = pd.cut(df_binned['age'], bins=custom_bins, labels=custom_labels)

print("Custom age binning:")
print(df_binned['age_bins_custom'].value_counts())

Custom age binning:
age_bins_custom
Young Adult    12
Youth           6
Middle Age      2
Senior          0
Name: count, dtype: int64


In [24]:
df_tips_processed = df_tips.copy()
print("Tips dataset missing values:", df_tips_processed.isnull().sum().sum())

scaler = StandardScaler()
df_tips_processed[['total_bill', 'tip', 'size']] = scaler.fit_transform(df_tips_processed[['total_bill', 'tip', 'size']])

df_tips_processed = pd.get_dummies(df_tips_processed, columns=['sex', 'smoker', 'day', 'time'])

print("Tips dataset after preprocessing shape:", df_tips_processed.shape)
print("Tips dataset columns:", list(df_tips_processed.columns))

Tips dataset missing values: 0
Tips dataset after preprocessing shape: (244, 13)
Tips dataset columns: ['total_bill', 'tip', 'size', 'sex_Male', 'sex_Female', 'smoker_Yes', 'smoker_No', 'day_Thur', 'day_Fri', 'day_Sat', 'day_Sun', 'time_Lunch', 'time_Dinner']


In [25]:
df_titanic_final = df_titanic.copy()

imputer = KNNImputer(n_neighbors=5)
df_titanic_final[['age']] = imputer.fit_transform(df_titanic_final[['age']])

df_titanic_final['embarked'].fillna(df_titanic_final['embarked'].mode()[0], inplace=True)

df_titanic_final.drop('deck', axis=1, inplace=True)

titanic_numerical_cols = ['age', 'fare']
scaler = StandardScaler()
df_titanic_final[titanic_numerical_cols] = scaler.fit_transform(df_titanic_final[titanic_numerical_cols])

le = LabelEncoder()
df_titanic_final['sex_encoded'] = le.fit_transform(df_titanic_final['sex'])
df_titanic_final['alive_encoded'] = le.fit_transform(df_titanic_final['alive'])

df_titanic_final = pd.get_dummies(df_titanic_final, columns=['embarked', 'class', 'who'])

print("Titanic dataset after preprocessing shape:", df_titanic_final.shape)
print("Missing values after preprocessing:", df_titanic_final.isnull().sum().sum())

Titanic dataset after preprocessing shape: (891, 22)
Missing values after preprocessing: 2


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_titanic_final['embarked'].fillna(df_titanic_final['embarked'].mode()[0], inplace=True)


In [26]:
df_flights_processed = df_flights.copy()

scaler = MinMaxScaler()
df_flights_processed[['passengers']] = scaler.fit_transform(df_flights_processed[['passengers']])

df_flights_processed['decade'] = pd.cut(df_flights_processed['year'], 
                                      bins=[1940, 1950, 1960, 1970], 
                                      labels=['1940s', '1950s', '1960s'])

df_flights_processed = pd.get_dummies(df_flights_processed, columns=['month'])

print("Flights dataset after preprocessing shape:", df_flights_processed.shape)

Flights dataset after preprocessing shape: (144, 15)


In [27]:
print(f"Custom Dataset: {df_encoded.shape}")
print(f"Tips Dataset: {df_tips_processed.shape}")
print(f"Titanic Dataset: {df_titanic_final.shape}")
print(f"Flights Dataset: {df_flights_processed.shape}")

Custom Dataset: (20, 13)
Tips Dataset: (244, 13)
Titanic Dataset: (891, 22)
Flights Dataset: (144, 15)
