In [1]:
import pandas as pd
from sklearn.impute import KNNImputer
# Sample DataFrame with missing values
data = {
    "A": [1, 2, None, 4, 5],
    "B": [None, 2, 3, 4, 5],
    "C": ["cat", "dog", None, "cat", "dog"]
}

# Convert the dictionary into a pandas DataFrame
df = pd.DataFrame(data)
print("Original DataFrame:\n", df)

# Initialize the KNN Imputer
# n_neighbors specifies the number of nearest neighbors to consider
imputer = KNNImputer(n_neighbors=2)

# Use the KNN imputer to fill in missing values
# Note: Categorical data needs to be converted to numerical format for KNN
# we'll encode categorical data using one-hot encoding
df_encoded = pd.get_dummies(df, columns=["C"], drop_first=True)
imputed_data = imputer.fit_transform(df_encoded)
print('Encoded data:\n', df_encoded)

# Convert back to DataFrame and restore original column names
imputed_df = pd.DataFrame(imputed_data, columns=df_encoded.columns)
print("\nImputed DataFrame:\n", imputed_df)


Original DataFrame:
      A    B     C
0  1.0  NaN   cat
1  2.0  2.0   dog
2  NaN  3.0  None
3  4.0  4.0   cat
4  5.0  5.0   dog
Encoded data:
      A    B  C_dog
0  1.0  NaN  False
1  2.0  2.0   True
2  NaN  3.0  False
3  4.0  4.0  False
4  5.0  5.0   True

Imputed DataFrame:
      A    B  C_dog
0  1.0  2.5    0.0
1  2.0  2.0    1.0
2  2.5  3.0    0.0
3  4.0  4.0    0.0
4  5.0  5.0    1.0


In [2]:
import pandas as pd

# 5. Handling Missing Data:
data_with_nan = {
    'vin': ['1HGCM82633A123456', '1HGCM82633A654321', '1HGCM82633A789012'],
    'manufacturer': ['Toyota', 'Ford', None],
    'year': [2020, None, 2021],
    'color': ['black', 'white', 'silver'],
    'body_type': ['Sedan', 'SUV', 'Coupe'],
    'engine_type': ['petrol', 'diesel', 'electric'],
    'transmission': ['manual', 'automatic', None],
    'fuel_type': ['gasoline', 'diesel', 'electric'],
    'seating_capacity': [5, 7, 4],
    'price': [20000.00, 25000.00, None],
    'status': ['active', 'sold', 'inactive'],
    'registration_date': ['2020-05-20', '2018-07-15', '2021-01-10']
}

df_with_nan = pd.DataFrame(data_with_nan)

# Drop rows with missing values
df_no_nan = df_with_nan.dropna()
print(df_no_nan)

# Fill missing values
df_filled_nan = df_with_nan.fillna({'manufacturer': 'Unknown', 'year': 2020, 'transmission': 'manual', 'price': 0})
print(df_filled_nan)

print("6. Aggregating Data:")
grouped_df = df_with_nan.groupby('manufacturer')['price'].sum()
print(grouped_df)


                 vin manufacturer    year  color body_type engine_type  \
0  1HGCM82633A123456       Toyota  2020.0  black     Sedan      petrol   

  transmission fuel_type  seating_capacity    price  status registration_date  
0       manual  gasoline                 5  20000.0  active        2020-05-20  
                 vin manufacturer    year   color body_type engine_type  \
0  1HGCM82633A123456       Toyota  2020.0   black     Sedan      petrol   
1  1HGCM82633A654321         Ford  2020.0   white       SUV      diesel   
2  1HGCM82633A789012      Unknown  2021.0  silver     Coupe    electric   

  transmission fuel_type  seating_capacity    price    status  \
0       manual  gasoline                 5  20000.0    active   
1    automatic    diesel                 7  25000.0      sold   
2       manual  electric                 4      0.0  inactive   

  registration_date  
0        2020-05-20  
1        2018-07-15  
2        2021-01-10  
6. Aggregating Data:
manufacturer
Ford   

In [3]:
import pandas as pd

df1 = pd.DataFrame({"id": [1, 2, 3, 4], "feature1": ["A", "B", "C", "D"]})
df2 = pd.DataFrame({"id": [3, 4, 5, 6], "feature2": ["X", "Y", "Z", "W"]})

inner_join_df = pd.merge(df1, df2, on="id", how="inner")
print(inner_join_df)


   id feature1 feature2
0   3        C        X
1   4        D        Y
