In [1]:
from typing import TypeVar


T = TypeVar('T')

def pr(val : T , title:str|None = None) : # type: ignore
    if title != None :
        print(f" ---------------------- {title} ---------------------- ")
    print('type: ' , type(val))
    print(val , '\n')
    # return val

In [2]:
import pandas as pd
df = pd.read_csv('./data/car-sales.csv') # type: ignore
pr(df , 'car sales')

 ---------------------- car sales ---------------------- 
type:  <class 'pandas.core.frame.DataFrame'>
     Make  Color  Odometer (KM)  Doors       Price
0  Toyota  White         150043      4   $4,000.00
1   Honda    Red          87899      4   $5,000.00
2  Toyota   Blue          32549      3   $7,000.00
3     BMW  Black          11179      5  $22,000.00
4  Nissan  White         213095      4   $3,500.00
5  Toyota  Green          99213      4   $4,500.00
6   Honda   Blue          45698      4   $7,500.00
7   Honda   Blue          54738      4   $7,000.00
8  Toyota  White          60000      4   $6,250.00
9  Nissan  White          31600      4   $9,700.00 



In [3]:
pr(df['Make'])
pr(df['Make'].str.lower())
pr('Make: '+df['Make'].str.lower())

type:  <class 'pandas.core.series.Series'>
0    Toyota
1     Honda
2    Toyota
3       BMW
4    Nissan
5    Toyota
6     Honda
7     Honda
8    Toyota
9    Nissan
Name: Make, dtype: object 

type:  <class 'pandas.core.series.Series'>
0    toyota
1     honda
2    toyota
3       bmw
4    nissan
5    toyota
6     honda
7     honda
8    toyota
9    nissan
Name: Make, dtype: object 

type:  <class 'pandas.core.series.Series'>
0    Make: toyota
1     Make: honda
2    Make: toyota
3       Make: bmw
4    Make: nissan
5    Make: toyota
6     Make: honda
7     Make: honda
8    Make: toyota
9    Make: nissan
Name: Make, dtype: object 



In [4]:
# i will import a csv with missing data for experimenting
df2 = pd.read_csv('./data/car-sales-missing-data.csv') # type: ignore
pr(df2)

type:  <class 'pandas.core.frame.DataFrame'>
     Make  Color  Odometer  Doors    Price
0  Toyota  White  150043.0    4.0   $4,000
1   Honda    Red   87899.0    4.0   $5,000
2  Toyota   Blue       NaN    3.0   $7,000
3     BMW  Black   11179.0    5.0  $22,000
4  Nissan  White  213095.0    4.0   $3,500
5  Toyota  Green       NaN    4.0   $4,500
6   Honda    NaN       NaN    4.0   $7,500
7   Honda   Blue       NaN    4.0      NaN
8  Toyota  White   60000.0    NaN      NaN
9     NaN  White   31600.0    4.0   $9,700 



In [5]:
# we will try to replace empty value with the mean value inplace
df2['Odometer'].fillna(df2['Odometer'].mean(skipna=True,numeric_only=True) , inplace=True) # type: ignore
pr(df2)

type:  <class 'pandas.core.frame.DataFrame'>
     Make  Color       Odometer  Doors    Price
0  Toyota  White  150043.000000    4.0   $4,000
1   Honda    Red   87899.000000    4.0   $5,000
2  Toyota   Blue   92302.666667    3.0   $7,000
3     BMW  Black   11179.000000    5.0  $22,000
4  Nissan  White  213095.000000    4.0   $3,500
5  Toyota  Green   92302.666667    4.0   $4,500
6   Honda    NaN   92302.666667    4.0   $7,500
7   Honda   Blue   92302.666667    4.0      NaN
8  Toyota  White   60000.000000    NaN      NaN
9     NaN  White   31600.000000    4.0   $9,700 



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df2['Odometer'].fillna(df2['Odometer'].mean(skipna=True,numeric_only=True) , inplace=True) # type: ignore


In [6]:
# now we will try to delete any row that have missing values 
df2.dropna(inplace=True)
pr(df2)

type:  <class 'pandas.core.frame.DataFrame'>
     Make  Color       Odometer  Doors    Price
0  Toyota  White  150043.000000    4.0   $4,000
1   Honda    Red   87899.000000    4.0   $5,000
2  Toyota   Blue   92302.666667    3.0   $7,000
3     BMW  Black   11179.000000    5.0  $22,000
4  Nissan  White  213095.000000    4.0   $3,500
5  Toyota  Green   92302.666667    4.0   $4,500 



In [7]:
df2 = pd.read_csv('./data/car-sales-missing-data.csv') # type: ignore


# Manipulating data 2

In [8]:
pr(df)

type:  <class 'pandas.core.frame.DataFrame'>
     Make  Color  Odometer (KM)  Doors       Price
0  Toyota  White         150043      4   $4,000.00
1   Honda    Red          87899      4   $5,000.00
2  Toyota   Blue          32549      3   $7,000.00
3     BMW  Black          11179      5  $22,000.00
4  Nissan  White         213095      4   $3,500.00
5  Toyota  Green          99213      4   $4,500.00
6   Honda   Blue          45698      4   $7,500.00
7   Honda   Blue          54738      4   $7,000.00
8  Toyota  White          60000      4   $6,250.00
9  Nissan  White          31600      4   $9,700.00 



In [None]:
# how to assign new column to an existing data frame.
seats_column = pd.Series([4,4,4,4,4,4,4,4])
df['car_seats'] = seats_column
pr(df)

type:  <class 'pandas.core.frame.DataFrame'>
     Make  Color  Odometer (KM)  Doors       Price  car_seats
0  Toyota  White         150043      4   $4,000.00        4.0
1   Honda    Red          87899      4   $5,000.00        4.0
2  Toyota   Blue          32549      3   $7,000.00        4.0
3     BMW  Black          11179      5  $22,000.00        4.0
4  Nissan  White         213095      4   $3,500.00        4.0
5  Toyota  Green          99213      4   $4,500.00        4.0
6   Honda   Blue          45698      4   $7,500.00        4.0
7   Honda   Blue          54738      4   $7,000.00        4.0
8  Toyota  White          60000      4   $6,250.00        NaN
9  Nissan  White          31600      4   $9,700.00        NaN 



In [11]:
# now we face a problem that some car_seats rows don't have value
df['car_seats'].fillna(5 ,inplace=True ) # type: ignore
pr(df)

type:  <class 'pandas.core.frame.DataFrame'>
     Make  Color  Odometer (KM)  Doors       Price  car_seats
0  Toyota  White         150043      4   $4,000.00        4.0
1   Honda    Red          87899      4   $5,000.00        4.0
2  Toyota   Blue          32549      3   $7,000.00        4.0
3     BMW  Black          11179      5  $22,000.00        4.0
4  Nissan  White         213095      4   $3,500.00        4.0
5  Toyota  Green          99213      4   $4,500.00        4.0
6   Honda   Blue          45698      4   $7,500.00        4.0
7   Honda   Blue          54738      4   $7,000.00        4.0
8  Toyota  White          60000      4   $6,250.00        5.0
9  Nissan  White          31600      4   $9,700.00        5.0 



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['car_seats'].fillna(5 ,inplace=True ) # type: ignore


In [13]:
# we could create a column from an ordinary list
fuel_economy : list[float] = [7.5,9,2,5.6,7.4,6,5,9,8,3]
df['fuel per 100 KM'] = fuel_economy
pr(df)

type:  <class 'pandas.core.frame.DataFrame'>
     Make  Color  Odometer (KM)  Doors       Price  car_seats  fuel per 100 KM
0  Toyota  White         150043      4   $4,000.00        4.0              7.5
1   Honda    Red          87899      4   $5,000.00        4.0              9.0
2  Toyota   Blue          32549      3   $7,000.00        4.0              2.0
3     BMW  Black          11179      5  $22,000.00        4.0              5.6
4  Nissan  White         213095      4   $3,500.00        4.0              7.4
5  Toyota  Green          99213      4   $4,500.00        4.0              6.0
6   Honda   Blue          45698      4   $7,500.00        4.0              5.0
7   Honda   Blue          54738      4   $7,000.00        4.0              9.0
8  Toyota  White          60000      4   $6,250.00        5.0              8.0
9  Nissan  White          31600      4   $9,700.00        5.0              3.0 



In [14]:
# we will try to get the value of fuel each car consume in it's entire life
df['total fuel used'] = (df['Odometer (KM)'] /100) * df['car_seats']
pr(df)

type:  <class 'pandas.core.frame.DataFrame'>
     Make  Color  Odometer (KM)  Doors       Price  car_seats  \
0  Toyota  White         150043      4   $4,000.00        4.0   
1   Honda    Red          87899      4   $5,000.00        4.0   
2  Toyota   Blue          32549      3   $7,000.00        4.0   
3     BMW  Black          11179      5  $22,000.00        4.0   
4  Nissan  White         213095      4   $3,500.00        4.0   
5  Toyota  Green          99213      4   $4,500.00        4.0   
6   Honda   Blue          45698      4   $7,500.00        4.0   
7   Honda   Blue          54738      4   $7,000.00        4.0   
8  Toyota  White          60000      4   $6,250.00        5.0   
9  Nissan  White          31600      4   $9,700.00        5.0   

   fuel per 100 KM  total fuel used  
0              7.5          6001.72  
1              9.0          3515.96  
2              2.0          1301.96  
3              5.6           447.16  
4              7.4          8523.80  
5          

In [15]:
# create a column from a single value.
df['Number of wheels'] = 4
pr(df)

type:  <class 'pandas.core.frame.DataFrame'>
     Make  Color  Odometer (KM)  Doors       Price  car_seats  \
0  Toyota  White         150043      4   $4,000.00        4.0   
1   Honda    Red          87899      4   $5,000.00        4.0   
2  Toyota   Blue          32549      3   $7,000.00        4.0   
3     BMW  Black          11179      5  $22,000.00        4.0   
4  Nissan  White         213095      4   $3,500.00        4.0   
5  Toyota  Green          99213      4   $4,500.00        4.0   
6   Honda   Blue          45698      4   $7,500.00        4.0   
7   Honda   Blue          54738      4   $7,000.00        4.0   
8  Toyota  White          60000      4   $6,250.00        5.0   
9  Nissan  White          31600      4   $9,700.00        5.0   

   fuel per 100 KM  total fuel used  Number of wheels  
0              7.5          6001.72                 4  
1              9.0          3515.96                 4  
2              2.0          1301.96                 4  
3              

In [18]:
df['passed road safety'] = True 
pr(df)
pr(df.dtypes)

type:  <class 'pandas.core.frame.DataFrame'>
     Make  Color  Odometer (KM)  Doors       Price  car_seats  \
0  Toyota  White         150043      4   $4,000.00        4.0   
1   Honda    Red          87899      4   $5,000.00        4.0   
2  Toyota   Blue          32549      3   $7,000.00        4.0   
3     BMW  Black          11179      5  $22,000.00        4.0   
4  Nissan  White         213095      4   $3,500.00        4.0   
5  Toyota  Green          99213      4   $4,500.00        4.0   
6   Honda   Blue          45698      4   $7,500.00        4.0   
7   Honda   Blue          54738      4   $7,000.00        4.0   
8  Toyota  White          60000      4   $6,250.00        5.0   
9  Nissan  White          31600      4   $9,700.00        5.0   

   fuel per 100 KM  total fuel used  Number of wheels  passed road safety  
0              7.5          6001.72                 4                True  
1              9.0          3515.96                 4                True  
2          

In [20]:
# we could drop column if we want 
df.drop('passed road safety' , axis=1 , inplace=True)
pr(df)

type:  <class 'pandas.core.frame.DataFrame'>
     Make  Color  Odometer (KM)  Doors       Price  car_seats  \
0  Toyota  White         150043      4   $4,000.00        4.0   
1   Honda    Red          87899      4   $5,000.00        4.0   
2  Toyota   Blue          32549      3   $7,000.00        4.0   
3     BMW  Black          11179      5  $22,000.00        4.0   
4  Nissan  White         213095      4   $3,500.00        4.0   
5  Toyota  Green          99213      4   $4,500.00        4.0   
6   Honda   Blue          45698      4   $7,500.00        4.0   
7   Honda   Blue          54738      4   $7,000.00        4.0   
8  Toyota  White          60000      4   $6,250.00        5.0   
9  Nissan  White          31600      4   $9,700.00        5.0   

   fuel per 100 KM  total fuel used  Number of wheels  
0              7.5          6001.72                 4  
1              9.0          3515.96                 4  
2              2.0          1301.96                 4  
3              