## 1. import necessary libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## 2. Read Dataset using `pd.read_csv`

In [2]:
df = pd.read_csv("../../DataSet/rond.ir/rond.ir_full.csv")

## 3. Review the Data

### 3.1 Display first 5 rows of Dataset

In [3]:
df.head()

Unnamed: 0,phone_number,price,status,city,time
0,0912 594 32 72,16600000,صفر,تهران,23 ثانیه
1,0912 36 57 259,17100000,صفر,تهران,23 ثانیه
2,0912 796 21 54,9700000,صفر,تهران,23 ثانیه
3,0912 051 91 73,7500000,صفر,تهران,23 ثانیه
4,0912 350 52 63,25000000,صفر,تهران,23 ثانیه


### 3.2 Check the number of samples and features

In [4]:
df.shape

(630040, 5)

### 3.3 Check Data types and memory usage

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 630040 entries, 0 to 630039
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   phone_number  630040 non-null  object
 1   price         630040 non-null  object
 2   status        630040 non-null  object
 3   city          630040 non-null  object
 4   time          623251 non-null  object
dtypes: object(5)
memory usage: 24.0+ MB


### 3.4 Drop rows with Null values

In [6]:
df.isnull().sum()

phone_number       0
price              0
status             0
city               0
time            6789
dtype: int64

In [7]:
df.dropna(inplace=True)

In [8]:
df.shape

(623251, 5)

### 3.5 Drop duplicated rows

In [9]:
df.duplicated().sum()

49998

In [10]:
df.drop_duplicates(inplace=True)

In [11]:
df.shape

(573253, 5)

## 4. Pre Process Dataset

### 4.1 `price` column

In [12]:
def process_price(price):
    """Convert a raw price value to an ``int``, or ``np.nan`` if it is not a plain number.

    Thousands separators (``,``) are removed first. Anything that does not
    consist solely of decimal digits afterwards (empty text, words, negative
    or fractional values, missing values) yields ``np.nan`` so that the row
    can be removed later with ``dropna``.

    Parameters
    ----------
    price : object
        Raw cell value, normally a string such as ``'16,600,000'``. Non-string
        values are converted with ``str`` first, so an already-parsed integer
        is accepted as well.

    Returns
    -------
    int or float
        The parsed price, or ``np.nan`` when the value is not a valid price.
    """
    digits = str(price).replace(',', '')
    # ``isdecimal`` rather than ``isdigit``: the latter also accepts characters
    # such as '²' which ``int`` cannot parse and would raise ValueError on.
    if digits.isdecimal():
        return int(digits)
    return np.nan

In [13]:
df.price = df.price.apply(process_price)

In [14]:
df.head()

Unnamed: 0,phone_number,price,status,city,time
0,0912 594 32 72,16600000.0,صفر,تهران,23 ثانیه
1,0912 36 57 259,17100000.0,صفر,تهران,23 ثانیه
2,0912 796 21 54,9700000.0,صفر,تهران,23 ثانیه
3,0912 051 91 73,7500000.0,صفر,تهران,23 ثانیه
4,0912 350 52 63,25000000.0,صفر,تهران,23 ثانیه


In [15]:
df.isnull().sum()

phone_number         0
price           171855
status               0
city                 0
time                 0
dtype: int64

### 171855 records didn't have valid prices

In [16]:
df.dropna(inplace=True)

In [17]:
df.shape

(401398, 5)

### 4.2 `status` column

In [18]:
status_values = df.status.unique()

In [19]:
status_values

array(['صفر', 'در حد صفر', 'کارکرده'], dtype=object)

In [20]:
#status_values[1] , status_values[2] = status_values[2] , status_values[1]

In [22]:
# Explicit ordinal encoding: 'صفر' (unused) < 'در حد صفر' (like new) < 'کارکرده' (used).
# Deriving the codes from `df.status.unique()` ties them to the order in which
# the values first occur in the data, so a differently ordered file would
# silently change the encoding. Any status missing from this mapping becomes
# NaN when passed through `.map`.
status_map = {'صفر': 0, 'در حد صفر': 1, 'کارکرده': 2}

In [23]:
status_map

{'صفر': 0, 'در حد صفر': 1, 'کارکرده': 2}

In [24]:
df.status = df.status.map(status_map)

In [25]:
df.head()

Unnamed: 0,phone_number,price,status,city,time
0,0912 594 32 72,16600000.0,0,تهران,23 ثانیه
1,0912 36 57 259,17100000.0,0,تهران,23 ثانیه
2,0912 796 21 54,9700000.0,0,تهران,23 ثانیه
3,0912 051 91 73,7500000.0,0,تهران,23 ثانیه
4,0912 350 52 63,25000000.0,0,تهران,23 ثانیه


### 4.3 `city` column

#### 4.3.1 Check city names and their size

In [26]:
cities = df.city.unique()

In [27]:
len(cities)

239

In [28]:
cities

array(['تهران', 'اصفهان', 'شيراز', 'جهرم', 'مشهد', 'همدان', 'ساری', 'رشت',
       'گرگان', 'شمیرانات', 'آمل', 'کرج', 'اسلام شهر', 'قم', 'سمنان',
       'تبريز', 'آبيک', 'ايلام', 'کرمان', 'ورامين', 'يزد', 'کرمانشاه',
       'نظرآباد', 'شهريار', 'گچساران', 'سنندج', 'کاشان', 'شهرضا', 'زنجان',
       'قشم', 'اهواز', 'بابل', 'پیشوا', 'رباط کريم', 'شهرکرد', 'اردبيل',
       'ابهر', 'سلماس', 'رفسنجان', 'کازرون', 'مرودشت', 'بندر عباس',
       'گناوه', 'آباده', 'بندر ماهشهر', 'قزوين', 'اروميه', 'دماوند',
       'برخوار', 'زاهدان', 'بوشهر', 'بيرجند', 'بجنورد', 'آبادان',
       'اليگودرز', 'گنبد کاووس', 'نجف آباد', 'بناب', 'آران و بيدگل',
       'جيرفت', 'زرند', 'خوي', 'دهاقان', 'سقز', 'ری', 'ميانه', 'اراک',
       'گراش', 'گناباد', 'بابلسر', 'نائين', 'شاهين شهر و میمه', 'مبارکه',
       'سميرم', 'خمينی شهر', 'چادگان', 'گلپايگان', 'خوانسار', 'خرمشهر',
       'دزفول', 'بانه', 'مريوان', 'قدس', 'بم', 'دامغان', 'تاکستان',
       'اسلام آباد غرب', 'ارزوئیه', 'ميبد', 'اسدآباد', 'سرايان',
       'بینالو

#### 4.3.2 Encode city names using `category_encoders.BinaryEncoder` 

In [29]:
import category_encoders as ce

In [30]:
binary_encoder = ce.BinaryEncoder(cols=['city'])
df = binary_encoder.fit_transform(df)

In [31]:
df.head()

Unnamed: 0,phone_number,price,status,city_0,city_1,city_2,city_3,city_4,city_5,city_6,city_7,time
0,0912 594 32 72,16600000.0,0,0,0,0,0,0,0,0,1,23 ثانیه
1,0912 36 57 259,17100000.0,0,0,0,0,0,0,0,0,1,23 ثانیه
2,0912 796 21 54,9700000.0,0,0,0,0,0,0,0,0,1,23 ثانیه
3,0912 051 91 73,7500000.0,0,0,0,0,0,0,0,0,1,23 ثانیه
4,0912 350 52 63,25000000.0,0,0,0,0,0,0,0,0,1,23 ثانیه


### 4.4 `time` column

In [32]:
times = df.time

In [33]:
times

0           23 ثانیه
1           23 ثانیه
2           23 ثانیه
3           23 ثانیه
4           23 ثانیه
             ...    
623552    39 روز پیش
623553    39 روز پیش
623556    39 روز پیش
623557    39 روز پیش
623558    40 روز پیش
Name: time, Length: 401398, dtype: object

#### 4.4.1 Define `regex` patterns for splitting digits and words

In [34]:
import re
pattern = r'(\d+) ([^\d]+)'

In [35]:
res = [re.findall(pattern , text)[0] for text in times]

In [36]:
res[:5]

[('23', 'ثانیه'),
 ('23', 'ثانیه'),
 ('23', 'ثانیه'),
 ('23', 'ثانیه'),
 ('23', 'ثانیه')]

#### 4.4.2 Extract time units

In [37]:
time_units = set()
for units in res:
    time_units.add(units[1])

In [38]:
time_units = list(time_units)
time_units

['ساعت', 'ثانیه', 'دقیقه', 'روز پیش']

In [39]:
time_units[1]

'ثانیه'

#### 4.4.3 Convert all `time_units` to seconds

In [40]:
# Seconds per unit, keyed by the literal unit text captured from the `time`
# column. The keys are spelled out instead of being looked up by position in
# `time_units`: that list is built from a `set`, and the iteration order of a
# set of strings can differ between interpreter runs, so positional indexing
# could attach the wrong multiplier to a unit after a kernel restart.
# An unknown unit raises KeyError when the map is applied.
time_units_map = {
    # "days ago"
    'روز پیش': 24 * 60 * 60,
    # minutes
    'دقیقه': 60,
    # seconds
    'ثانیه': 1,
    # hours
    'ساعت': 60 * 60,
}

In [41]:
preProcessed_time = [int(time_expr[0]) * time_units_map[time_expr[1]] for time_expr in res]

In [42]:
preProcessed_time[:5]

[23, 23, 23, 23, 23]

In [43]:
df.time = preProcessed_time

In [44]:
df.head()

Unnamed: 0,phone_number,price,status,city_0,city_1,city_2,city_3,city_4,city_5,city_6,city_7,time
0,0912 594 32 72,16600000.0,0,0,0,0,0,0,0,0,1,23
1,0912 36 57 259,17100000.0,0,0,0,0,0,0,0,0,1,23
2,0912 796 21 54,9700000.0,0,0,0,0,0,0,0,0,1,23
3,0912 051 91 73,7500000.0,0,0,0,0,0,0,0,0,1,23
4,0912 350 52 63,25000000.0,0,0,0,0,0,0,0,0,1,23


### 4.5 `phone_number` column

In [45]:
phone_numbers = df.phone_number.astype(str)

In [46]:
def preprocess_phone(num):
    """Normalise a phone number to a 10-character ASCII digit string.

    Spaces are removed, any decimal digits are converted to ASCII ``0``-``9``,
    and a single leading ``0`` is stripped. The result is returned only when
    exactly 10 digits remain.

    Parameters
    ----------
    num : object
        Raw phone number, normally a string such as ``'0912 594 32 72'``.
        Non-string values are converted with ``str`` first.

    Returns
    -------
    str or float
        The 10-digit number without its leading zero, or ``np.nan`` when the
        value contains non-digit characters or has the wrong length.
    """
    num = str(num).replace(" ", "")
    # ``isdecimal`` rather than ``isdigit``: ``isdigit`` also accepts characters
    # such as '²' that ``int`` cannot parse, which would break the per-digit
    # ``int(phone[i])`` conversions applied to this column later on.
    # The empty string is rejected here too, so ``num[0]`` below is safe.
    if not num.isdecimal():
        return np.nan
    # Map non-ASCII decimal digits (e.g. Persian digits) to '0'-'9' so that
    # later literal comparisons such as ``== '0'`` or ``== '00'`` can match.
    num = "".join(str(int(ch)) for ch in num)
    if num[0] == '0':
        num = num[1:]
    if len(num) == 10:
        return num
    return np.nan

In [47]:
phone_numbers = phone_numbers.apply(preprocess_phone)

In [48]:
df.phone_number = phone_numbers

In [49]:
df.head()

Unnamed: 0,phone_number,price,status,city_0,city_1,city_2,city_3,city_4,city_5,city_6,city_7,time
0,9125943272,16600000.0,0,0,0,0,0,0,0,0,1,23
1,9123657259,17100000.0,0,0,0,0,0,0,0,0,1,23
2,9127962154,9700000.0,0,0,0,0,0,0,0,0,1,23
3,9120519173,7500000.0,0,0,0,0,0,0,0,0,1,23
4,9123505263,25000000.0,0,0,0,0,0,0,0,0,1,23


## 5. Drop rows with `NaN` values

In [50]:
df.isnull().sum()

phone_number    410
price             0
status            0
city_0            0
city_1            0
city_2            0
city_3            0
city_4            0
city_5            0
city_6            0
city_7            0
time              0
dtype: int64

In [51]:
df.dropna(inplace=True)

In [52]:
df.isnull().sum()

phone_number    0
price           0
status          0
city_0          0
city_1          0
city_2          0
city_3          0
city_4          0
city_5          0
city_6          0
city_7          0
time            0
dtype: int64

In [53]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 400988 entries, 0 to 623558
Data columns (total 12 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   phone_number  400988 non-null  object 
 1   price         400988 non-null  float64
 2   status        400988 non-null  int64  
 3   city_0        400988 non-null  int64  
 4   city_1        400988 non-null  int64  
 5   city_2        400988 non-null  int64  
 6   city_3        400988 non-null  int64  
 7   city_4        400988 non-null  int64  
 8   city_5        400988 non-null  int64  
 9   city_6        400988 non-null  int64  
 10  city_7        400988 non-null  int64  
 11  time          400988 non-null  int64  
dtypes: float64(1), int64(10), object(1)
memory usage: 39.8+ MB


## 6. Feature Extraction

### 6.1 Area code

### 6.1.1 Count unique area codes

In [54]:
# Distinct 3-character prefixes found across all phone numbers, kept both as a
# set (`area_code`) and as a list (`area_code_lst`).
area_code = {phone[:3] for phone in df.phone_number}
area_code_lst = [*area_code]

In [55]:
len(area_code_lst)

34

### 6.1.2 Encode area codes

In [56]:
df['area_code'] = df.phone_number.str[:3]
area_code_bin_enc = ce.BinaryEncoder(cols=['area_code'])
df = area_code_bin_enc.fit_transform(df)

In [57]:
df.head()

Unnamed: 0,phone_number,price,status,city_0,city_1,city_2,city_3,city_4,city_5,city_6,city_7,time,area_code_0,area_code_1,area_code_2,area_code_3,area_code_4,area_code_5
0,9125943272,16600000.0,0,0,0,0,0,0,0,0,1,23,0,0,0,0,0,1
1,9123657259,17100000.0,0,0,0,0,0,0,0,0,1,23,0,0,0,0,0,1
2,9127962154,9700000.0,0,0,0,0,0,0,0,0,1,23,0,0,0,0,0,1
3,9120519173,7500000.0,0,0,0,0,0,0,0,0,1,23,0,0,0,0,0,1
4,9123505263,25000000.0,0,0,0,0,0,0,0,0,1,23,0,0,0,0,0,1


### 6.2 Extract features from `phone_number` column

In [58]:
def number_features(df):
    """Add 0/1 pattern-flag columns computed from ``df['phone_number']`` to ``df``.

    Every flag is computed by slicing each phone number as a string. All
    slices start at index 3 or later, so the first three characters are never
    inspected directly; the code indexes up to position 9, so it expects
    strings of at least 10 characters.

    The new columns are assigned onto ``df`` in place, in the order written
    below, and the function returns nothing.

    Besides ``df`` the function reads two module-level names: ``re`` and
    ``area_code_lst`` (used by the ``area_code_repeat`` flag).
    """
    
    # last_7_sim: every character from index 3 to the end is identical
    def allCharactersSame(s) :
        """Return True if every character of ``s`` equals ``s[0]``.

        Empty and single-character strings return True because the loop body
        never runs.
        """
        n = len(s)
        for i in range(1, n) :
            if s[i] != s[0] :
                return False
        return True
    df['last_7_sim'] = [1 if allCharactersSame(num[3:]) else 0 for num in df['phone_number']]
    
    # last_6_sim: positions 4..end identical, and position 3 differs from them
    df['last_6_sim'] = [1 if (allCharactersSame(num[4:]) and num[3]!=num[4]) else 0 for num in df['phone_number']]
    
    # first_6_sim: positions 3-8 identical, and position 9 differs from them
    df['first_6_sim'] = [1 if (allCharactersSame(num[3:9]) and num[8]!=num[9]) else 0 for num in df['phone_number']]
    
    # three_pair_last: the pairs at 4-5, 6-7 and 8-9 are all equal
    df['three_pair_last'] = [1 if ((num[4:6]==num[6:8]) and (num[4:6]==num[8:10])) else 0 for num in df['phone_number']]
    
    # three_pair_first: the pairs at 3-4, 5-6 and 7-8 are all equal
    df['three_pair_first'] = [1 if ((num[3:5]==num[5:7]) and (num[3:5]==num[7:9])) else 0 for num in df['phone_number']]
    
    # just_two_num: positions 3..end contain exactly two distinct characters
    def check_two_num(s):
        """Return True if ``s`` contains exactly two distinct characters."""
        my_set = set()
        for i in s:
            my_set.add(i)
        if(len(list(my_set)) == 2):
            return True
        return False
    df['just_two_num'] = [1 if check_two_num(num[3:]) else 0 for num in df['phone_number']]
    
    # last_5_sim: positions 5..end identical, and position 4 differs from them
    df['last_5_sim'] = [1 if (allCharactersSame(num[5:]) and num[4]!=num[5]) else 0 for num in df['phone_number']]
    
    # first_5_sim: positions 3-7 identical, and position 8 differs from them
    df['first_5_sim'] = [1 if (allCharactersSame(num[3:8]) and num[7]!=num[8]) else 0 for num in df['phone_number']]
    
    # double_100: two consecutive "<non-zero digit>00" groups (e.g. '200300')
    # matched exactly once in positions 4-9 or in positions 3-8
    pattern_hundred = r'([1-9]00){2}'
    df['double_100'] = [1 if (len(re.findall(pattern_hundred , num[4:10]))==1 or
                        len(re.findall(pattern_hundred , num[3:9]))==1) else 0 for num in df['phone_number']]
    
    # two_triple_pair: a repeated triple like '532 532', either 3-5 == 6-8 or 4-6 == 7-9
    df['two_triple_pair'] = [1 if ((num[6:9]==num[3:6]) or (num[4:7]==num[7:10])) else 0 for num in df['phone_number']]
    
    # first_1000: '00' at 4-5 with a non-zero position 3, or '00' at 5-6 with a
    # non-zero position 4 (`and` binds tighter than `or`, so the final
    # `and num[4]!='0'` applies only to the second alternative)
    df['first_1000'] = [1 if ((num[4:6]=='00' and num[3]!='0') or (num[5:7]=='00') and num[4]!='0') else 0 for num in df['phone_number']]
    
    # last_1000: '00' at 7-8 with a non-zero position 6, or '00' at 8-9
    # (the second alternative has no non-zero check on the preceding digit)
    df['last_1000'] = [1 if ((num[7:9]=='00' and num[6]!='0') or (num[8:10]=='00')) else 0 for num in df['phone_number']]
    
    # last_4_sim: positions 6..end identical, and position 5 differs from them
    df['last_4_sim'] = [1 if (allCharactersSame(num[6:]) and num[5]!=num[6]) else 0 for num in df['phone_number']]
    
    # first_4_sim: positions 3-6 identical, and position 7 differs from them
    df['first_4_sim'] = [1 if (allCharactersSame(num[3:7]) and num[6]!=num[7]) else 0 for num in df['phone_number']]
    
    # middle_5_sim: positions 4-8 identical, with different digits at positions 3 and 9
    df['middle_5_sim'] = [1 if (allCharactersSame(num[4:9]) and num[3]!=num[4] and num[8]!=num[9]) else 0 for num in df['phone_number']]
    
    # last_double_10: two or three consecutive "<non-zero digit>0" groups (like '4090')
    # matched exactly once in positions 4..end, and the final character is '0'
    pattern_double_10 = r'([1-9]0){2,3}'
    df['last_double_10'] = [1 if (len(re.findall(pattern_double_10 , num[4:]))==1 and num[-1]=='0') else 0 for num in df['phone_number']]
    
    # first_double_10: same pattern (re-assigned below with the identical value),
    # matched exactly once in positions 3-8, and position 4 is '0'
    pattern_double_10 = r'([1-9]0){2,3}'
    df['first_double_10'] = [1 if (len(re.findall(pattern_double_10 , num[3:9]))==1 and num[4]=='0') else 0 for num in df['phone_number']]
    
    # last_two_pair: pair at 6-7 equals pair at 8-9, and the pair's two digits differ
    df['last_two_pair'] = [1 if (num[6:8]==num[8:10] and num[6]!=num[7]) else 0 for num in df['phone_number']]
    
    # first_two_pair: pair at 3-4 equals pair at 5-6, and the pair's two digits differ
    df['first_two_pair'] = [1 if (num[3:5]==num[5:7] and num[3]!=num[4]) else 0 for num in df['phone_number']]
    
    # last_3_sim: positions 7..end identical, and position 6 differs from them
    df['last_3_sim'] = [1 if allCharactersSame(num[7:]) and num[6]!=num[7] else 0 for num in df['phone_number']]
    
    # first_3_sim: positions 3-5 identical, and position 6 differs from them
    df['first_3_sim'] = [1 if (allCharactersSame(num[3:6]) and num[5]!=num[6]) else 0 for num in df['phone_number']]
    
    # area_code_repeat: some entry of the module-level `area_code_lst` occurs as a
    # substring of positions 3..end
    def check_area_code(s):
        """Return True if any entry of the module-level ``area_code_lst`` is a substring of ``s``."""
        for area_code in area_code_lst:
            if area_code in s:
                return True
        return False
    df['area_code_repeat'] = [1 if check_area_code(num[3:]) else 0 for num in df['phone_number']]
    
    # wheighted_num ("weighted" number; column name kept as spelled):
    # positions 3-5 equal positions 7-9, position 6 is unconstrained
    df['wheighted_num'] = [1 if (num[3:6] == num[7:10]) else 0 for num in df['phone_number']]
    
    # middle_4_sim: positions 4-7 identical, with different digits at positions 3 and 8
    df['middle_4_sim'] = [1 if (allCharactersSame(num[4:8]) and num[3]!=num[4] and num[7]!=num[8]) else 0 for num in df['phone_number']]
    
    # birthday_num: '13' at positions 3-4 or 6-7 (original note: birthday number like
    # '1345'; presumably a 13xx calendar year - TODO confirm)
    df['birthday_num'] = [1 if (num[3:5] == '13') or (num[6:8] == '13') else 0 for num in df['phone_number']]
    
    # middle_3_sim: a run of three identical digits at 6-8, 5-7 or 4-6 that is
    # bounded on both sides by a different digit
    def check_middle_3_condition(num):
        """Return True if ``num`` has a bounded three-digit run at positions 6-8, 5-7 or 4-6."""
        if ((allCharactersSame(num[6:9]) and num[5]!=num[6] and num[8]!=num[9]) or 
            (allCharactersSame(num[5:8]) and num[4]!=num[5] and num[7]!=num[8]) or
                (allCharactersSame(num[4:7]) and num[3]!=num[4] and num[6]!=num[7])):
            return True
        return False
    df['middle_3_sim'] = [1 if check_middle_3_condition(num) else 0 for num in df['phone_number']]
    
    # repeat_2_digits: at least two non-overlapping "same digit twice in a row"
    # matches in positions 3..end
    pattern_repeat_2_digits = r'([0-9])\1{1}'
    df['repeat_2_digits'] = [1 if len(re.findall(pattern_repeat_2_digits , num[3:]))>=2 else 0 for num in df['phone_number']]
    
    # first_10000: positions 3 and 8 are non-zero, and '000' sits at 4-6 or 5-7
    df['first_10000'] = [1 if ((num[3]!='0' and num[8]!='0') and ((num[4:7]=='000') or (num[5:8]=='000'))) else 0 for num in df['phone_number']]
    
    # last_10000: '000' at 6-8 with a non-zero position 5, or '000' at 7-9
    df['last_10000'] = [1 if ((num[6:9]=='000' and num[5]!='0') or (num[7:10]=='000')) else 0 for num in df['phone_number']]
    
    # million: position 3 is non-zero, and '00000' sits at 4-8 or 5-9
    df['million'] = [1 if (num[3]!='0' and ((num[4:9]=='00000') or (num[5:10]=='00000'))) else 0 for num in df['phone_number']]



In [59]:
number_features(df)

In [60]:
df.head()

Unnamed: 0,phone_number,price,status,city_0,city_1,city_2,city_3,city_4,city_5,city_6,...,first_3_sim,area_code_repeat,wheighted_num,middle_4_sim,birthday_num,middle_3_sim,repeat_2_digits,first_10000,last_10000,million
0,9125943272,16600000.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,9123657259,17100000.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,9127962154,9700000.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,9120519173,7500000.0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,9123505263,25000000.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [61]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 400988 entries, 0 to 623558
Data columns (total 48 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   phone_number      400988 non-null  object 
 1   price             400988 non-null  float64
 2   status            400988 non-null  int64  
 3   city_0            400988 non-null  int64  
 4   city_1            400988 non-null  int64  
 5   city_2            400988 non-null  int64  
 6   city_3            400988 non-null  int64  
 7   city_4            400988 non-null  int64  
 8   city_5            400988 non-null  int64  
 9   city_6            400988 non-null  int64  
 10  city_7            400988 non-null  int64  
 11  time              400988 non-null  int64  
 12  area_code_0       400988 non-null  int64  
 13  area_code_1       400988 non-null  int64  
 14  area_code_2       400988 non-null  int64  
 15  area_code_3       400988 non-null  int64  
 16  area_code_4       40

### 6.3 To let model learn complex patterns in phone numbers we split each phone number into two groups `first_three` and `last_four`

### 6.3.1 `first_three` columns

In [62]:
# One integer Series per digit at string positions 3, 4 and 5 of the phone
# number; each is then stored as its own column.
first_three_0, first_three_1, first_three_2 = [
    df.phone_number.apply(lambda phone, pos=pos: int(phone[pos]))
    for pos in (3, 4, 5)
]
df['first_three_0'] = first_three_0
df['first_three_1'] = first_three_1
df['first_three_2'] = first_three_2

### 6.3.2 `last_four` columns

In [63]:
# One integer Series per digit at string positions 6, 7, 8 and 9 of the phone
# number; each is then stored as its own column.
last_four_0, last_four_1, last_four_2, last_four_3 = [
    df.phone_number.apply(lambda phone, pos=pos: int(phone[pos]))
    for pos in (6, 7, 8, 9)
]
df['last_four_0'] = last_four_0
df['last_four_1'] = last_four_1
df['last_four_2'] = last_four_2
df['last_four_3'] = last_four_3

In [64]:
df.head()

Unnamed: 0,phone_number,price,status,city_0,city_1,city_2,city_3,city_4,city_5,city_6,...,first_10000,last_10000,million,first_three_0,first_three_1,first_three_2,last_four_0,last_four_1,last_four_2,last_four_3
0,9125943272,16600000.0,0,0,0,0,0,0,0,0,...,0,0,0,5,9,4,3,2,7,2
1,9123657259,17100000.0,0,0,0,0,0,0,0,0,...,0,0,0,3,6,5,7,2,5,9
2,9127962154,9700000.0,0,0,0,0,0,0,0,0,...,0,0,0,7,9,6,2,1,5,4
3,9120519173,7500000.0,0,0,0,0,0,0,0,0,...,0,0,0,0,5,1,9,1,7,3
4,9123505263,25000000.0,0,0,0,0,0,0,0,0,...,0,0,0,3,5,0,5,2,6,3


## 7. Save the Pre Processed Dataset

In [65]:
df.to_csv('../../DataSet/Pre processed/rond.ir_full_preprocessed.csv' , index=False)