## 1. import necessary libraries

In [35]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## 2. Read Dataset using `pd.read_csv`

In [36]:
df = pd.read_csv("../../DataSet/rond.ir/rond.ir_full.csv")

## 3. Review the Data

### 3.1 Display first 5 rows of Dataset

In [37]:
df.head()

Unnamed: 0,phone_number,price,status,city,time
0,0912 594 32 72,16600000,صفر,تهران,23 ثانیه
1,0912 36 57 259,17100000,صفر,تهران,23 ثانیه
2,0912 796 21 54,9700000,صفر,تهران,23 ثانیه
3,0912 051 91 73,7500000,صفر,تهران,23 ثانیه
4,0912 350 52 63,25000000,صفر,تهران,23 ثانیه


### 3.2 Check the number of samples and features

In [38]:
df.shape

(630040, 5)

### 3.3 Check Data types and memory usage

In [39]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 630040 entries, 0 to 630039
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   phone_number  630040 non-null  object
 1   price         630040 non-null  object
 2   status        630040 non-null  object
 3   city          630040 non-null  object
 4   time          623251 non-null  object
dtypes: object(5)
memory usage: 24.0+ MB


### 3.4 Drop rows with Null values

In [40]:
df.isnull().sum()

phone_number       0
price              0
status             0
city               0
time            6789
dtype: int64

In [41]:
# Share of missing values per column, as a percentage of the total row count.
null_counts = df.isnull().sum()
null_percent = (null_counts / len(df)) * 100
for column, percent in null_percent.items():
    print(f'Column {column} has {percent}% missing data.')

Column phone_number has 0.0% missing data.
Column price has 0.0% missing data.
Column status has 0.0% missing data.
Column city has 0.0% missing data.
Column time has 1.0775506317059234% missing data.


In [42]:
df.dropna(inplace=True)

In [43]:
df.shape

(623251, 5)

### 3.5 Drop duplicated rows

In [44]:
df.duplicated().sum()

49998

In [45]:
df.drop_duplicates(inplace=True)

In [46]:
df.shape

(573253, 5)

## 4. Pre Process Dataset

### 4.1 `price` column

In [47]:
def process_price(price):
    """Convert a raw price value into a float.

    Thousands separators (",") and surrounding whitespace are removed first.
    Anything that is not a plain run of decimal digits afterwards (free text,
    an empty string, a missing value) yields NaN.

    Parameters
    ----------
    price : str
        Raw value of the `price` column. Non-string values are passed to
        `float()` directly and become NaN when that fails.

    Returns
    -------
    float
        The numeric price, or ``np.nan`` when the value is not a valid number.
    """
    # Non-string cells (None, NaN, already-numeric values) have no `.replace`.
    if not isinstance(price, str):
        try:
            return float(price)
        except (TypeError, ValueError):
            return np.nan
    price = price.replace(',', '').strip()
    # `isdecimal` rather than `isdigit`: `isdigit` also accepts characters such
    # as superscripts ("²"), which `float` cannot parse and would raise on.
    if price.isdecimal():
        return float(price)
    return np.nan

In [48]:
df.price = df.price.apply(process_price)

In [49]:
df.head()

Unnamed: 0,phone_number,price,status,city,time
0,0912 594 32 72,16600000.0,صفر,تهران,23 ثانیه
1,0912 36 57 259,17100000.0,صفر,تهران,23 ثانیه
2,0912 796 21 54,9700000.0,صفر,تهران,23 ثانیه
3,0912 051 91 73,7500000.0,صفر,تهران,23 ثانیه
4,0912 350 52 63,25000000.0,صفر,تهران,23 ثانیه


In [50]:
df.isnull().sum()

phone_number         0
price           171855
status               0
city                 0
time                 0
dtype: int64

### 171855 records didn't have valid prices

In [51]:
df.dropna(inplace=True)

In [52]:
df.shape

(401398, 5)

### 4.2 `status` column

In [53]:
status_values = df.status.unique()
status_values

array(['صفر', 'در حد صفر', 'کارکرده'], dtype=object)

In [54]:
#status_values[1] , status_values[2] = status_values[2] , status_values[1]

In [55]:
# Fixed ordinal codes for the known conditions, ordered from new to used:
# 'صفر' (new) -> 0, 'در حد صفر' (like new) -> 1, 'کارکرده' (used) -> 2.
# Deriving the codes from `unique()` ties them to the order in which the values
# first appear in the data, so a re-scraped or re-sorted file would silently
# change the encoding.
KNOWN_STATUS_ORDER = ['صفر', 'در حد صفر', 'کارکرده']
status_map = {status: index for index, status in enumerate(KNOWN_STATUS_ORDER)}
# Any status not listed above still receives a code, appended after the known ones.
for status in status_values:
    status_map.setdefault(status, len(status_map))
status_map

{'صفر': 0, 'در حد صفر': 1, 'کارکرده': 2}

In [56]:
df.status = df.status.map(status_map)

In [57]:
df.head()

Unnamed: 0,phone_number,price,status,city,time
0,0912 594 32 72,16600000.0,0,تهران,23 ثانیه
1,0912 36 57 259,17100000.0,0,تهران,23 ثانیه
2,0912 796 21 54,9700000.0,0,تهران,23 ثانیه
3,0912 051 91 73,7500000.0,0,تهران,23 ثانیه
4,0912 350 52 63,25000000.0,0,تهران,23 ثانیه


### 4.3 `city` column

#### 4.3.1 Check city names and their size

In [58]:
cities = df.city.unique()
print(f'We have {len(cities)} cities in data.')

We have 239 cities in data.


In [59]:
cities

array(['تهران', 'اصفهان', 'شيراز', 'جهرم', 'مشهد', 'همدان', 'ساری', 'رشت',
       'گرگان', 'شمیرانات', 'آمل', 'کرج', 'اسلام شهر', 'قم', 'سمنان',
       'تبريز', 'آبيک', 'ايلام', 'کرمان', 'ورامين', 'يزد', 'کرمانشاه',
       'نظرآباد', 'شهريار', 'گچساران', 'سنندج', 'کاشان', 'شهرضا', 'زنجان',
       'قشم', 'اهواز', 'بابل', 'پیشوا', 'رباط کريم', 'شهرکرد', 'اردبيل',
       'ابهر', 'سلماس', 'رفسنجان', 'کازرون', 'مرودشت', 'بندر عباس',
       'گناوه', 'آباده', 'بندر ماهشهر', 'قزوين', 'اروميه', 'دماوند',
       'برخوار', 'زاهدان', 'بوشهر', 'بيرجند', 'بجنورد', 'آبادان',
       'اليگودرز', 'گنبد کاووس', 'نجف آباد', 'بناب', 'آران و بيدگل',
       'جيرفت', 'زرند', 'خوي', 'دهاقان', 'سقز', 'ری', 'ميانه', 'اراک',
       'گراش', 'گناباد', 'بابلسر', 'نائين', 'شاهين شهر و میمه', 'مبارکه',
       'سميرم', 'خمينی شهر', 'چادگان', 'گلپايگان', 'خوانسار', 'خرمشهر',
       'دزفول', 'بانه', 'مريوان', 'قدس', 'بم', 'دامغان', 'تاکستان',
       'اسلام آباد غرب', 'ارزوئیه', 'ميبد', 'اسدآباد', 'سرايان',
       'بینالو

#### 4.3.2 Encode city names using `category_encoders.BinaryEncoder` 

In [60]:
import category_encoders as ce

In [61]:
binary_encoder = ce.BinaryEncoder(cols=['city'])
df = binary_encoder.fit_transform(df)

In [62]:
df.head()

Unnamed: 0,phone_number,price,status,city_0,city_1,city_2,city_3,city_4,city_5,city_6,city_7,time
0,0912 594 32 72,16600000.0,0,0,0,0,0,0,0,0,1,23 ثانیه
1,0912 36 57 259,17100000.0,0,0,0,0,0,0,0,0,1,23 ثانیه
2,0912 796 21 54,9700000.0,0,0,0,0,0,0,0,0,1,23 ثانیه
3,0912 051 91 73,7500000.0,0,0,0,0,0,0,0,0,1,23 ثانیه
4,0912 350 52 63,25000000.0,0,0,0,0,0,0,0,0,1,23 ثانیه


### 4.4 `time` column

In [63]:
times = df.time

In [64]:
times

0           23 ثانیه
1           23 ثانیه
2           23 ثانیه
3           23 ثانیه
4           23 ثانیه
             ...    
623552    39 روز پیش
623553    39 روز پیش
623556    39 روز پیش
623557    39 روز پیش
623558    40 روز پیش
Name: time, Length: 401398, dtype: object

#### 4.4.1 Define `regex` patterns for splitting digits and words

In [65]:
import re
pattern = r'(\d+) ([^\d]+)'

In [66]:
# Split every time expression into a (number, unit) tuple, e.g. ('23', 'ثانیه').
# `findall(...)[0]` keeps only the first match of `pattern` in each text and
# raises IndexError for a text that the pattern does not match at all.
res = [re.findall(pattern , text)[0] for text in times]
res[:5]

[('23', 'ثانیه'),
 ('23', 'ثانیه'),
 ('23', 'ثانیه'),
 ('23', 'ثانیه'),
 ('23', 'ثانیه')]

#### 4.4.2 Extract time units

In [67]:
# Collect the distinct unit strings (second tuple element) of the parsed expressions.
time_units = set()
for units in res:
    time_units.add(units[1])
# NOTE: a set has no guaranteed iteration order, and string hashing is
# randomised per interpreter session by default, so the positions in this list
# can differ between kernel restarts. Look units up by value, not by index.
time_units = list(time_units)
time_units

['ساعت', 'دقیقه', 'روز پیش', 'ثانیه']

In [68]:
time_units[1]

'دقیقه'

#### 4.4.3 Convert all `time_units` to seconds

In [69]:
# Number of seconds represented by one unit of each time expression.
# The keys are spelled out explicitly: `time_units` is built from a set, whose
# order changes between kernel sessions, so picking keys by list position can
# attach the wrong multiplier to a unit after a restart without any error.
time_units_map = {
    'ثانیه': 1,                  # seconds
    'دقیقه': 60,                 # minutes
    'ساعت': 60 * 60,             # hours
    'روز پیش': 24 * 60 * 60,     # days ago
}

In [70]:
# Turn every (amount, unit) pair into a number of seconds.
preProcessed_time = []
for amount, unit in res:
    preProcessed_time.append(int(amount) * time_units_map[unit])
preProcessed_time[:5]

[23, 23, 23, 23, 23]

In [71]:
df.time = preProcessed_time

In [72]:
df.head()

Unnamed: 0,phone_number,price,status,city_0,city_1,city_2,city_3,city_4,city_5,city_6,city_7,time
0,0912 594 32 72,16600000.0,0,0,0,0,0,0,0,0,1,23
1,0912 36 57 259,17100000.0,0,0,0,0,0,0,0,0,1,23
2,0912 796 21 54,9700000.0,0,0,0,0,0,0,0,0,1,23
3,0912 051 91 73,7500000.0,0,0,0,0,0,0,0,0,1,23
4,0912 350 52 63,25000000.0,0,0,0,0,0,0,0,0,1,23


### 4.5 `phone_number` column

In [73]:
phone_numbers = df.phone_number.astype(str)

In [74]:
def preprocess_phone(num):
    """Normalise a phone number string to 10 digits without the leading zero.

    Spaces are removed and a single leading "0" is dropped. The cleaned string
    is returned only when exactly 10 digits remain; any other input (non-digit
    characters, empty string, wrong length) yields ``np.nan``.
    """
    digits = num.replace(" ", "")
    if digits.isdigit():
        # Only one leading zero is stripped, never several.
        if digits.startswith('0'):
            digits = digits[1:]
        if len(digits) == 10:
            return digits
    return np.nan

In [75]:
phone_numbers = phone_numbers.apply(preprocess_phone)

In [76]:
df.phone_number = phone_numbers

In [77]:
df.head()

Unnamed: 0,phone_number,price,status,city_0,city_1,city_2,city_3,city_4,city_5,city_6,city_7,time
0,9125943272,16600000.0,0,0,0,0,0,0,0,0,1,23
1,9123657259,17100000.0,0,0,0,0,0,0,0,0,1,23
2,9127962154,9700000.0,0,0,0,0,0,0,0,0,1,23
3,9120519173,7500000.0,0,0,0,0,0,0,0,0,1,23
4,9123505263,25000000.0,0,0,0,0,0,0,0,0,1,23


In [78]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 401398 entries, 0 to 623558
Data columns (total 12 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   phone_number  400988 non-null  object 
 1   price         401398 non-null  float64
 2   status        401398 non-null  int64  
 3   city_0        401398 non-null  int64  
 4   city_1        401398 non-null  int64  
 5   city_2        401398 non-null  int64  
 6   city_3        401398 non-null  int64  
 7   city_4        401398 non-null  int64  
 8   city_5        401398 non-null  int64  
 9   city_6        401398 non-null  int64  
 10  city_7        401398 non-null  int64  
 11  time          401398 non-null  int64  
dtypes: float64(1), int64(10), object(1)
memory usage: 39.8+ MB


## 5. Drop rows with `NaN` values

In [79]:
df.isnull().sum()

phone_number    410
price             0
status            0
city_0            0
city_1            0
city_2            0
city_3            0
city_4            0
city_5            0
city_6            0
city_7            0
time              0
dtype: int64

In [80]:
df.dropna(inplace=True)

In [81]:
df.isnull().sum()

phone_number    0
price           0
status          0
city_0          0
city_1          0
city_2          0
city_3          0
city_4          0
city_5          0
city_6          0
city_7          0
time            0
dtype: int64

In [82]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 400988 entries, 0 to 623558
Data columns (total 12 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   phone_number  400988 non-null  object 
 1   price         400988 non-null  float64
 2   status        400988 non-null  int64  
 3   city_0        400988 non-null  int64  
 4   city_1        400988 non-null  int64  
 5   city_2        400988 non-null  int64  
 6   city_3        400988 non-null  int64  
 7   city_4        400988 non-null  int64  
 8   city_5        400988 non-null  int64  
 9   city_6        400988 non-null  int64  
 10  city_7        400988 non-null  int64  
 11  time          400988 non-null  int64  
dtypes: float64(1), int64(10), object(1)
memory usage: 39.8+ MB


## 6. Feature Extraction

### 6.1 Area code

### 6.1.1 Count unique area codes

In [83]:
# Every distinct 3-character prefix found among the cleaned phone numbers.
area_code = {number[:3] for number in df.phone_number}
area_code_lst = list(area_code)
print(f"We have {len(area_code)} different area codes in data.")

We have 34 different area codes in data.


### 6.1.2 Encode area codes

In [84]:
df['area_code'] = df.phone_number.str[:3]
area_code_bin_enc = ce.BinaryEncoder(cols=['area_code'])
df = area_code_bin_enc.fit_transform(df)

In [85]:
df.head()

Unnamed: 0,phone_number,price,status,city_0,city_1,city_2,city_3,city_4,city_5,city_6,city_7,time,area_code_0,area_code_1,area_code_2,area_code_3,area_code_4,area_code_5
0,9125943272,16600000.0,0,0,0,0,0,0,0,0,1,23,0,0,0,0,0,1
1,9123657259,17100000.0,0,0,0,0,0,0,0,0,1,23,0,0,0,0,0,1
2,9127962154,9700000.0,0,0,0,0,0,0,0,0,1,23,0,0,0,0,0,1
3,9120519173,7500000.0,0,0,0,0,0,0,0,0,1,23,0,0,0,0,0,1
4,9123505263,25000000.0,0,0,0,0,0,0,0,0,1,23,0,0,0,0,0,1


### 6.2 Extract features from `phone_number` column

In [86]:
def chack_last_stair(num):
    """Return True if index 6 equals index 8, or index 7 equals index 9.

    `num` is indexed as a string; positions are 0-based.
    """
    if num[6] == num[8]:
        return True
    return num[7] == num[9]

def check_first_stair(num):
    """Return True if index 3 equals index 5, or index 4 equals index 6.

    `num` is indexed as a string; positions are 0-based.
    """
    if num[3] == num[5]:
        return True
    return num[4] == num[6]

def check_mirror_4_first(num):
    """Return True if indices 3-6 read the same in both directions (ABBA)."""
    return (num[3], num[4]) == (num[6], num[5])

def check_mirror_4_last(num):
    """Return True if indices 6-9 read the same in both directions (ABBA)."""
    return (num[7], num[6]) == (num[8], num[9])

def check_mirror_3(num):
    """Return True if indices 3-5 are mirrored by indices 9-7.

    Index 3 must equal 9, index 4 must equal 8 and index 5 must equal 7;
    index 6 is not inspected.
    """
    return all(num[3 + step] == num[9 - step] for step in range(3))

def rep_3_first_last(num):
    """Return True if indices 3-5 are repeated at indices 7-9."""
    return all(num[pos] == num[pos + 4] for pos in (3, 4, 5))

def ordered_first_3(num):
    """Return True if the digits at indices 3, 4, 5 each increase by exactly 1."""
    low, mid, high = int(num[3]), int(num[4]), int(num[5])
    return mid - low == 1 and high - mid == 1

def ordered_last_3(num):
    """Return True if the digits at indices 7, 8, 9 each increase by exactly 1."""
    low, mid, high = int(num[7]), int(num[8]), int(num[9])
    return mid - low == 1 and high - mid == 1

def two_couple_first(num):
    """Return True if indices 3-4 are repeated at indices 5-6 (ABAB)."""
    return (num[3], num[4]) == (num[5], num[6])

def two_couple_last(num):
    """Return True if indices 6-7 are repeated at indices 8-9 (ABAB)."""
    return (num[6], num[7]) == (num[8], num[9])

def three_couple_first(num):
    """Return True if indices 3-8 are one two-digit pair written three times (ABABAB)."""
    # Each index from 3 to 6 must equal the index two places later.
    return all(num[pos] == num[pos + 2] for pos in range(3, 7))

def three_couple_last(num):
    """Return True if indices 4-9 are one two-digit pair written three times (ABABAB)."""
    # Each index from 4 to 7 must equal the index two places later.
    return all(num[pos] == num[pos + 2] for pos in range(4, 8))

def mul_ten_first(num):
    """Return True if the characters at indices 4 and 6 are both "0"."""
    return num[4] == "0" == num[6]

def mul_ten_last(num):
    """Return True if the characters at indices 7 and 9 are both "0"."""
    return (num[7], num[9]) == ("0", "0")

def mul_thousand_first(num):
    """Return True if the characters at indices 4 and 5 are both "0"."""
    return all(num[pos] == "0" for pos in (4, 5))

def mul_thousand_last(num):
    """Return True if the characters at indices 7 and 8 are both "0".

    Index 9 is not inspected.
    """
    return all(num[pos] == "0" for pos in (7, 8))

def mul_ten_thousand_first(num):
    """Return True if the characters at indices 5, 6 and 7 are all "0"."""
    return num[5] == num[6] == num[7] == "0"

def mul_ten_thousand_last(num):
    """Return True if the characters at indices 7, 8 and 9 are all "0"."""
    return num[7] == num[8] == num[9] == "0"

def million(num):
    """Return True if five consecutive zeros occupy indices 4-8 or indices 5-9.

    Parameters
    ----------
    num : str
        Phone number string; positions are 0-based.

    Returns
    -------
    bool
    """
    # Test both windows directly. `str.find` reports only the FIRST run of
    # zeros, so a number whose zeros already start at index 3 (for example
    # "9120000000") was rejected although indices 4-8 are all zero as well.
    return num[4:9] == "00000" or num[5:10] == "00000"

def three_rep_first(num):
    """Return True if indices 3, 4 and 5 hold the same character."""
    return num[3] == num[4] == num[5]

def three_rep_mid(num):
    """Return True if indices 5, 6 and 7 hold the same character."""
    return num[5] == num[6] == num[7]

def three_rep_last(num):
    """Return True if indices 7, 8 and 9 hold the same character."""
    return num[7] == num[8] == num[9]


def four_rep_first(num):
    """Return True if indices 3 through 6 all hold the same character."""
    return all(num[pos] == num[pos + 1] for pos in range(3, 6))

def four_rep_mid(num):
    """Return True if indices 4 through 7 all hold the same character."""
    return all(num[pos] == num[pos + 1] for pos in range(4, 7))

def four_rep_last(num):
    """Return True if indices 6 through 9 all hold the same character."""
    return all(num[pos] == num[pos + 1] for pos in range(6, 9))


def five_rep_first(num):
    """Return True if indices 3 through 7 all hold the same character."""
    return num[3] == num[4] == num[5] == num[6] == num[7]

def five_rep_mid(num):
    """Return True if indices 4 through 8 all hold the same character."""
    return num[4] == num[5] == num[6] == num[7] == num[8]

def five_rep_last(num):
    """Return True if indices 5 through 9 all hold the same character."""
    return num[5] == num[6] == num[7] == num[8] == num[9]

def six_rep_first(num):
    """Return True if indices 3 through 8 all hold the same character."""
    return all(num[pos] == num[pos + 1] for pos in range(3, 8))

def six_rep_last(num):
    """Return True if indices 4 through 9 all hold the same character."""
    return all(num[pos] == num[pos + 1] for pos in range(4, 9))

def all_same(num):
    """Return True if indices 3 through 9 all hold the same character."""
    return all(num[pos] == num[pos + 1] for pos in range(3, 9))

def two_digit(num):
    """Return True if everything from index 3 onward uses exactly two distinct characters."""
    distinct_chars = set(num[3:])
    return len(distinct_chars) == 2

def two_rep_first(num):
    """Return True if index 3 equals index 4 and index 5 equals index 6 (AABB)."""
    return (num[3], num[5]) == (num[4], num[6])

def two_rep_last(num):
    """Return True if index 6 equals index 7 and index 8 equals index 9 (AABB)."""
    return (num[6], num[8]) == (num[7], num[9])

def has_birthdate_first(num):
    """Return True if indices 3-6, read as an integer, lie strictly between 1300 and 1400."""
    year = int(num[3:7])
    # Both bounds are exclusive: 1300 and 1400 themselves do not qualify.
    return year > 1300 and year < 1400

def has_birthdate_last(num):
    """Return True if everything from index 6 on, read as an integer, lies strictly between 1300 and 1400."""
    year = int(num[6:])
    # Both bounds are exclusive: 1300 and 1400 themselves do not qualify.
    return year > 1300 and year < 1400


def has_area_code(num):
    """Return True if the first three characters occur again somewhere after index 2."""
    prefix, remainder = num[:3], num[3:]
    return prefix in remainder

def extract_features(df):
    """Add one column per pattern function to `df`, modifying it in place.

    Every function in the list below is applied to each value of
    `df.phone_number`; the result is stored in a column named after the
    function (its `__name__`). Nothing is returned.
    """
    detectors = (
        chack_last_stair, check_first_stair,
        check_mirror_4_first, check_mirror_4_last, check_mirror_3,
        rep_3_first_last,
        ordered_first_3, ordered_last_3,
        two_couple_first, two_couple_last,
        three_couple_first, three_couple_last,
        mul_ten_first, mul_ten_last,
        mul_thousand_first, mul_thousand_last,
        mul_ten_thousand_first, mul_ten_thousand_last,
        million,
        three_rep_first, three_rep_mid, three_rep_last,
        four_rep_first, four_rep_mid, four_rep_last,
        five_rep_first, five_rep_mid, five_rep_last,
        six_rep_first, six_rep_last,
        all_same, two_digit,
        two_rep_first, two_rep_last,
        has_birthdate_first, has_birthdate_last,
        has_area_code,
    )

    phone_numbers = df.phone_number
    for detector in detectors:
        # Column order follows the order of the tuple above.
        df[detector.__name__] = phone_numbers.apply(detector)
        

extract_features(df)

In [87]:
df.head()

Unnamed: 0,phone_number,price,status,city_0,city_1,city_2,city_3,city_4,city_5,city_6,...,five_rep_last,six_rep_first,six_rep_last,all_same,two_digit,two_rep_first,two_rep_last,has_birthdate_first,has_birthdate_last,has_area_code
0,9125943272,16600000.0,0,0,0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,False,False
1,9123657259,17100000.0,0,0,0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,False,False
2,9127962154,9700000.0,0,0,0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,False,False
3,9120519173,7500000.0,0,0,0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,False,False
4,9123505263,25000000.0,0,0,0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,False,False


In [88]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 400988 entries, 0 to 623558
Data columns (total 55 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   phone_number            400988 non-null  object 
 1   price                   400988 non-null  float64
 2   status                  400988 non-null  int64  
 3   city_0                  400988 non-null  int64  
 4   city_1                  400988 non-null  int64  
 5   city_2                  400988 non-null  int64  
 6   city_3                  400988 non-null  int64  
 7   city_4                  400988 non-null  int64  
 8   city_5                  400988 non-null  int64  
 9   city_6                  400988 non-null  int64  
 10  city_7                  400988 non-null  int64  
 11  time                    400988 non-null  int64  
 12  area_code_0             400988 non-null  int64  
 13  area_code_1             400988 non-null  int64  
 14  area_code_2         

### 6.3 To let the model learn complex patterns in phone numbers, we split the seven digits after the area code into two groups: `first_three` and `last_four`

### 6.3.1 `first_three` columns

In [89]:
# Digits at indices 3, 4 and 5 of each phone number, one integer column per digit.
first_three_0, first_three_1, first_three_2 = (
    df.phone_number.apply(lambda phone, position=position: int(phone[position]))
    for position in (3, 4, 5)
)
df['first_three_0'] = first_three_0
df['first_three_1'] = first_three_1
df['first_three_2'] = first_three_2

### 6.3.2 `last_four` columns

In [90]:
# Digits at indices 6, 7, 8 and 9 of each phone number, one integer column per digit.
last_four_0, last_four_1, last_four_2, last_four_3 = (
    df.phone_number.apply(lambda phone, position=position: int(phone[position]))
    for position in (6, 7, 8, 9)
)
df['last_four_0'] = last_four_0
df['last_four_1'] = last_four_1
df['last_four_2'] = last_four_2
df['last_four_3'] = last_four_3

In [91]:
df.head()

Unnamed: 0,phone_number,price,status,city_0,city_1,city_2,city_3,city_4,city_5,city_6,...,has_birthdate_first,has_birthdate_last,has_area_code,first_three_0,first_three_1,first_three_2,last_four_0,last_four_1,last_four_2,last_four_3
0,9125943272,16600000.0,0,0,0,0,0,0,0,0,...,False,False,False,5,9,4,3,2,7,2
1,9123657259,17100000.0,0,0,0,0,0,0,0,0,...,False,False,False,3,6,5,7,2,5,9
2,9127962154,9700000.0,0,0,0,0,0,0,0,0,...,False,False,False,7,9,6,2,1,5,4
3,9120519173,7500000.0,0,0,0,0,0,0,0,0,...,False,False,False,0,5,1,9,1,7,3
4,9123505263,25000000.0,0,0,0,0,0,0,0,0,...,False,False,False,3,5,0,5,2,6,3


## 7. Save the Pre Processed Dataset

In [92]:
df.to_csv('../../DataSet/rond.ir_full_preprocessed.csv' , index=False)