## This notebook records some of the tricks I have learnt from work and study with Python

## String manipulation

### Example 1: How to read a string column, extract a certain pattern from each row (within that column) and create a new column

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Toy data set: one free-text vehicle description and one numeric value per row.
P_df = pd.DataFrame(
    [
        ('Coupe XP 4S', 100),
        ('Sedan XLT 4cylinder', 200),
        ('PICKUP EXTRA', 300),
        ('SEDAN XL LT', 400),
        ('COUPE 4_cyl SE', 500),
        ('pickup XST', 600),
        ('Wagon SET 4wd', 700),
    ],
    columns=['VEH_INFO', 'Value'],
)
P_df

Unnamed: 0,VEH_INFO,Value
0,Coupe XP 4S,100
1,Sedan XLT 4cylinder,200
2,PICKUP EXTRA,300
3,SEDAN XL LT,400
4,COUPE 4_cyl SE,500
5,pickup XST,600
6,Wagon SET 4wd,700


In [3]:
# I wanted to find out the vehicle style of each observation: coupe, sedan, pickup, or something else


In [4]:
# Define a function to identify if a keyword is contained:
def VEH_STYLE_FN(DAT):
    """Classify a vehicle description by the first style keyword it contains.

    The keywords are checked in the order COUPE, PICKUP, SEDAN and the first
    one found as a substring wins. Matching is case-sensitive, so callers
    should upper-case the text before passing it in.

    Parameters
    ----------
    DAT : str or any object
        The description text. Anything that is not already a string
        (``None``, NaN, a list of tokens, ...) is converted with ``str()``
        first, so missing values simply end up as 'OTHERS' instead of
        raising.

    Returns
    -------
    str
        'COUPE', 'PICKUP', 'SEDAN', or 'OTHERS' when no keyword is present.
    """
    text = DAT if isinstance(DAT, str) else str(DAT)
    # A plain substring test is all that is needed for a single value; it avoids
    # building a one-element Series per row and calling int() on a Series,
    # which newer pandas versions deprecate.
    for style in ('COUPE', 'PICKUP', 'SEDAN'):
        if style in text:
            return style
    return 'OTHERS'



In [5]:
# Upper-case each description so that the keyword search in VEH_STYLE_FN matches
# regardless of the original casing, then classify row by row.
# astype(str) turns missing values into text, so they end up as 'OTHERS'.
P_df['STYLE'] = P_df['VEH_INFO'].astype(str).str.upper().apply(VEH_STYLE_FN)
P_df

Unnamed: 0,VEH_INFO,Value,STYLE
0,Coupe XP 4S,100,COUPE
1,Sedan XLT 4cylinder,200,SEDAN
2,PICKUP EXTRA,300,PICKUP
3,SEDAN XL LT,400,SEDAN
4,COUPE 4_cyl SE,500,COUPE
5,pickup XST,600,PICKUP
6,Wagon SET 4wd,700,OTHERS


### Example 2: Another example of extracting information from one string column and assigning values to a new column

In [6]:
# Example 2 works on its own frame: P_df plus a made-up VIN for every vehicle.
# assign() returns a new DataFrame, so P_df itself is not modified.
P_df_ex2 = P_df.assign(
    VIN=[
        '2T1BU12348',
        '1F1BU45613',
        '2T1BU12789',
        'ZZ1BU1206Z',
        '2T1BU12832',
        '2T1BU12806',
        '2T1BU12555',
    ]
)
P_df_ex2

Unnamed: 0,VEH_INFO,Value,STYLE,VIN
0,Coupe XP 4S,100,COUPE,2T1BU12348
1,Sedan XLT 4cylinder,200,SEDAN,1F1BU45613
2,PICKUP EXTRA,300,PICKUP,2T1BU12789
3,SEDAN XL LT,400,SEDAN,ZZ1BU1206Z
4,COUPE 4_cyl SE,500,COUPE,2T1BU12832
5,pickup XST,600,PICKUP,2T1BU12806
6,Wagon SET 4wd,700,OTHERS,2T1BU12555


In [7]:
# Define a function to allocate the vehicle to grps based on the last two digits in the 'fake' VIN

## Here is the logic: 
## If the second-to-last digit is 0 or 1 --> Ctrl group
## For the remaining vehicles: if the last digit is 0-3 --> Group 1, if 4-6 --> Group 2, otherwise --> Group 3

def VIN_random_fn(DAT):
    """Assign a VIN string to a test cell based on its last two characters.

    The second-to-last character is read first: 0 or 1 puts the vehicle in
    the control group, and in that case the last character is never looked
    at. Otherwise the last character picks the group:
    0-3 -> 'Grp_1', 4-6 -> 'Grp_2', anything else -> 'Grp_3'.

    Both characters are converted with int(), so a character that is not a
    digit raises ValueError, and a string shorter than two characters raises
    IndexError.
    """
    second_to_last = int(DAT[-2])
    if second_to_last in (0, 1):
        return 'Ctrl'

    last = int(DAT[-1])
    if last in (0, 1, 2, 3):
        return 'Grp_1'
    if last in (4, 5, 6):
        return 'Grp_2'
    return 'Grp_3'

In [8]:
# First of all, keep only the rows whose VIN ends in two ASCII digits.
# VIN_random_fn calls int() on BOTH of the last two characters, so checking only the
# final character would still let a VIN such as '2T1BU128Z2' through and make the
# grouping step fail with a ValueError.

has_two_trailing_digits = P_df_ex2['VIN'].astype(str).str.contains(r'[0-9]{2}\Z', regex=True)
P_df_ex2 = P_df_ex2[has_two_trailing_digits].copy()

In [9]:
# Next, derive the grouping column "Test_cell" by running VIN_random_fn on every VIN:

vin_text = P_df_ex2['VIN'].astype(str)
P_df_ex2['Test_cell'] = vin_text.apply(VIN_random_fn)

In [10]:
# Display the filtered frame together with its new 'Test_cell' column.
P_df_ex2

Unnamed: 0,VEH_INFO,Value,STYLE,VIN,Test_cell
0,Coupe XP 4S,100,COUPE,2T1BU12348,Grp_3
1,Sedan XLT 4cylinder,200,SEDAN,1F1BU45613,Ctrl
2,PICKUP EXTRA,300,PICKUP,2T1BU12789,Grp_3
4,COUPE 4_cyl SE,500,COUPE,2T1BU12832,Grp_1
5,pickup XST,600,PICKUP,2T1BU12806,Ctrl
6,Wagon SET 4wd,700,OTHERS,2T1BU12555,Grp_2


## Time Variable Issue

Time variables are sometimes tricky to handle, so let's look at some examples.

### Basic codes and handling bi-week indicator

In [11]:
# Build the frame for the time examples: a copy of P_df_ex2 with one date string
# per row (six here) in a new 'DT' column. assign() leaves P_df_ex2 unchanged.

Time_df = P_df_ex2.assign(
    DT=['2020-01-21', '2020-02-25', '2021-03-01', '2021-08-18', '2022-02-28', '2022-05-08']
)
Time_df

Unnamed: 0,VEH_INFO,Value,STYLE,VIN,Test_cell,DT
0,Coupe XP 4S,100,COUPE,2T1BU12348,Grp_3,2020-01-21
1,Sedan XLT 4cylinder,200,SEDAN,1F1BU45613,Ctrl,2020-02-25
2,PICKUP EXTRA,300,PICKUP,2T1BU12789,Grp_3,2021-03-01
4,COUPE 4_cyl SE,500,COUPE,2T1BU12832,Grp_1,2021-08-18
5,pickup XST,600,PICKUP,2T1BU12806,Ctrl,2022-02-28
6,Wagon SET 4wd,700,OTHERS,2T1BU12555,Grp_2,2022-05-08


In [12]:
# Basic date features:

## After running the following code, 'year', 'month' and 'day' hold the calendar components of each observation date.
## 'week' is the ISO-8601 week number within the year, e.g. 2021-08-18 is in the 33rd week of 2021.
## (Around New Year the ISO week can belong to the neighbouring ISO year, so it may not line up with 'year'.)
## 'biweek' numbers consecutive 14-day periods, starting at 1 on the first day of observation (in this case: 2020-01-21).
## e.g. 2021-08-18 is 575 days after 2020-01-21, and 575 // 14 + 1 = 42, so it falls in bi-week 42.

Time_df['DT'] = pd.to_datetime(Time_df['DT'], format='%Y-%m-%d')

# Read the components straight from the datetime accessor instead of formatting each
# date as text and parsing that text back into an integer.
Time_df['year'] = Time_df['DT'].dt.year.astype(int)
Time_df['month'] = Time_df['DT'].dt.month.astype(int)
Time_df['day'] = Time_df['DT'].dt.day.astype(int)

# '%V' is the ISO-8601 week number.
Time_df['week'] = Time_df['DT'].dt.strftime('%V').astype(int)

# Whole days since the earliest date, in 14-day buckets, numbered from 1.
Time_df['biweek'] = ((Time_df['DT'] - Time_df['DT'].min()).dt.days // 14 + 1).astype(int)

Time_df

Unnamed: 0,VEH_INFO,Value,STYLE,VIN,Test_cell,DT,year,month,day,week,biweek
0,Coupe XP 4S,100,COUPE,2T1BU12348,Grp_3,2020-01-21,2020,1,21,4,1
1,Sedan XLT 4cylinder,200,SEDAN,1F1BU45613,Ctrl,2020-02-25,2020,2,25,9,3
2,PICKUP EXTRA,300,PICKUP,2T1BU12789,Grp_3,2021-03-01,2021,3,1,9,29
4,COUPE 4_cyl SE,500,COUPE,2T1BU12832,Grp_1,2021-08-18,2021,8,18,33,42
5,pickup XST,600,PICKUP,2T1BU12806,Ctrl,2022-02-28,2022,2,28,9,55
6,Wagon SET 4wd,700,OTHERS,2T1BU12555,Grp_2,2022-05-08,2022,5,8,18,60


In [13]:
# Summarise the columns: non-null counts, dtypes and memory usage.
Time_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6 entries, 0 to 6
Data columns (total 11 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   VEH_INFO   6 non-null      object        
 1   Value      6 non-null      int64         
 2   STYLE      6 non-null      object        
 3   VIN        6 non-null      object        
 4   Test_cell  6 non-null      object        
 5   DT         6 non-null      datetime64[ns]
 6   year       6 non-null      int64         
 7   month      6 non-null      int64         
 8   day        6 non-null      int64         
 9   week       6 non-null      int64         
 10  biweek     6 non-null      int64         
dtypes: datetime64[ns](1), int64(6), object(4)
memory usage: 576.0+ bytes
