In [1]:
import warnings
# To adjust seaborn settings for the plots.
import seaborn as sns
import matplotlib.pyplot as plt
# To run the Z score calculations if needed
from scipy import stats
import pandas_profiling                                  # Pandas Profiling v
import pandas as pd
import numpy as np
import folium
# To display values only upto four decimal places.
np.set_printoptions(precision=4)

# To suppress pandas warnings.
pd.set_option('mode.chained_assignment', None)
# To display all the data in each column
pd.set_option('display.max_colwidth', None)
# To display every column of the dataset in head()
pd.options.display.max_columns = 200
# To supress displaying large numbers in scientific format
pd.options.display.float_format = '{:.5f}'.format

# To apply seaborn whitegrid style to the plots.
plt.style.use('seaborn-whitegrid')
%matplotlib inline

sns.set(style='whitegrid',
        # Muted looks better for the charts!
        palette="muted",
        font_scale=1.3,
        color_codes=True)

# To suppress all the warnings in the notebook.
warnings.filterwarnings('ignore')

In [2]:
# Load the raw events file from the working directory.
# NOTE(review): with default parsing, device_id comes in as float64 and
# timestamp as object (see the df1.info() output) — confirm that is intended.
DATA_FILE = "Capstone_Final_Data_Delhi.csv"
df1 = pd.read_csv(DATA_FILE)

In [3]:
# Column dtypes, non-null counts and memory footprint of the raw frame.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 751797 entries, 0 to 751796
Data columns (total 12 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   event_id      751797 non-null  int64  
 1   device_id     751797 non-null  float64
 2   timestamp     751797 non-null  object 
 3   longitude     751797 non-null  float64
 4   latitude      751797 non-null  float64
 5   city          751797 non-null  object 
 6   state         751797 non-null  object 
 7   phone_brand   751797 non-null  object 
 8   device_model  751797 non-null  object 
 9   gender        751797 non-null  object 
 10  age           751797 non-null  float64
 11  group         751797 non-null  object 
dtypes: float64(4), int64(1), object(7)
memory usage: 68.8+ MB


In [4]:
# Preview the first five rows.
df1.head()

Unnamed: 0,event_id,device_id,timestamp,longitude,latitude,city,state,phone_brand,device_model,gender,age,group
0,2765368,2.9733477869949143e+18,2016-05-07 22:52:05,77.22568,28.73014,Delhi,Delhi,Umidigi,UIMI3,M,35.0,M32-38
1,605968,-3.264499652692493e+18,2016-05-02 14:23:04,77.25681,28.75791,Delhi,Delhi,Xiaomi,MI pad,M,24.0,M23-26
2,2309828,3.020769016382465e+18,2016-05-04 09:24:50,77.26392,28.75791,Delhi,Delhi,Samsung,Galaxy Mega 5.8,F,53.0,F43+
3,2633127,-3.0901669552805755e+18,2016-05-05 04:42:59,77.28592,28.75838,Delhi,Delhi,Huawei,Honor 3C,M,33.0,M32-38
4,2224400,7.566051280927281e+18,2016-05-01 00:07:47,77.24048,28.71566,Delhi,Delhi,Huawei,Ascend G700T,F,34.0,F33-42


In [5]:
# Number of distinct values in each column.
df1.nunique()

event_id        751797
device_id         4909
timestamp       412099
longitude         4795
latitude          4789
city                10
state                1
phone_brand         61
device_model       631
gender               2
age                 69
group               12
dtype: int64

In [6]:
# Number of missing values in each column.
df1.isna().sum()

event_id        0
device_id       0
timestamp       0
longitude       0
latitude        0
city            0
state           0
phone_brand     0
device_model    0
gender          0
age             0
group           0
dtype: int64

In [7]:
# Distinct phone brands present in the data.
df1['phone_brand'].unique()

array(['Umidigi', 'Xiaomi', 'Samsung', 'Huawei', 'LeEco', 'VIVO', 'OPPO',
       'Meizu', 'Coolpad', 'ASUS', 'Nubia', 'HTC', 'Belfon', 'Nokia',
       'Hammer', 'Lenovo', 'LOGO', 'China Mobile', 'Bailifeng', 'Hēi mǐ',
       'Osoxin', 'OnePlus', 'Duowei', 'ZUK', 'Meitu', 'Yǔ xìn', 'Newmann',
       'Vitu', 'Hisense', 'LG', 'Xiǎo yáng shù', 'Aipel', 'QCong',
       'Tianyu', 'Qiku', 'Báimǐ', 'TCL', 'Taipower', 'Qing Cheng',
       'Konka', 'Zhizunbao', 'Wei Mi', 'Coolby', 'Plus', 'Guang Xin',
       'Ayuni', 'InFocus', 'Yitong', 'Motorola', 'Europa', 'Desai',
       'Yipai', 'Dà kělè', 'Mi Ge', 'Waveguide', 'Kiwish', 'Yougo',
       'Banghua', 'Weibi', 'Rubiks Cube', 'Ovo'], dtype=object)

In [8]:
# Distinct device models present in the data.
# NOTE(review): several values in the output carry trailing spaces
# (e.g. 'Honor 3C ', 'Redmi 2 ') — consider normalising with .str.strip().
df1['device_model'].unique()

array(['UIMI3', 'MI pad', 'Galaxy Mega 5.8', 'Honor 3C ', 'Ascend G700T',
       'Galaxy S5', 'Redmi 2 ', 'Ascend P7', 'Note Pro ', 'Superphone 1 ',
       'MI 3', 'X5Max+', 'R2017', 'Redmi Note3 ', 'Charm blue note ',
       'MX2', '8729', 'X5M', 'Galaxy A5', 'R1C', 'ZenFone 2', 'Find 7',
       'Redmi Note ', 'Galaxy Note 2', 'Z9 mini', 'Galaxy S6 Edge+',
       'Galaxy On7', 'G610S', 'One E8', 'Y37', 'Honor6 ', 'Galaxy S4',
       'Galaxy Grand 2', 'Redmi 1s ', 'MX3', '8721',
       'Honor 3X play version ', 'Redmi Note2 ', 'Mai Mang 3 ',
       'Galaxy S3', 'Honor 6 Plus ', 'U9508', 'F103', 'MI 4', 'R817',
       'Y11IT', 'X5Max', 'B199', 'Galaxy Note 3', 'Redmi ',
       'Daichi F1PLUS ', 'M5', 'BF T18', 'Y22', 'Mate 7', 'Charm blue 2 ',
       'Honor play 4C ', 'Honor 7 ', 'Honor play 4X ', 'S920', 'MI 2',
       'G520-T10', '9190l', 'S7I', 'XL', 'Superphone 1S ', '7295+',
       'Galaxy Note 4', 'T1', 'C8815', 'MI 1S', 'Ascend GX1', 'Ascend G6',
       'MX4 Pro', 'GN151', 'R7s',

In [9]:
# Count the distinct events logged for every (device, longitude, latitude)
# combination; the count is exposed as the `data_count` column.
df_test = (
    df1.groupby(['device_id', 'longitude', 'latitude'])['event_id']
    .nunique()
    .reset_index(name='data_count')
)
df_test

Unnamed: 0,device_id,longitude,latitude,data_count
0,-9222956879900151808.00000,77.28042,28.71864,65
1,-9221026417907252224.00000,77.24309,28.74644,147
2,-9218769147970108416.00000,77.24084,28.75193,20
3,-9218605091224095744.00000,77.02822,28.63578,12
4,-9209849644716288000.00000,77.28803,28.68541,76
...,...,...,...,...
4907,9203700427918537728.00000,77.26155,28.70455,6
4908,9204754392928356352.00000,77.28148,28.71456,71
4909,9210091097295259648.00000,77.27209,28.75612,56
4910,9216925254504448000.00000,77.23488,28.72913,109


In [10]:
# Keep the first row seen for each device_id and drop the per-event columns.
# The columns are dropped by name (not by position) so a change in the CSV
# column order cannot silently remove the wrong fields, and the result is
# built in one chain instead of mutating a derived frame in place.
df_test1 = (
    df1.drop_duplicates('device_id')
    .drop(columns=['event_id', 'timestamp', 'longitude', 'latitude'])
)
df_test1

Unnamed: 0,device_id,city,state,phone_brand,device_model,gender,age,group
0,2973347786994914304.00000,Delhi,Delhi,Umidigi,UIMI3,M,35.00000,M32-38
1,-3264499652692492800.00000,Delhi,Delhi,Xiaomi,MI pad,M,24.00000,M23-26
2,3020769016382465024.00000,Delhi,Delhi,Samsung,Galaxy Mega 5.8,F,53.00000,F43+
3,-3090166955280575488.00000,Delhi,Delhi,Huawei,Honor 3C,M,33.00000,M32-38
4,7566051280927281152.00000,Delhi,Delhi,Huawei,Ascend G700T,F,34.00000,F33-42
...,...,...,...,...,...,...,...,...
670627,6364206277211502592.00000,NangloiJat,Delhi,Huawei,Mate 7,M,25.00000,M23-26
724080,-1047213733085437440.00000,Deoli,Delhi,Meizu,MX4,M,31.00000,M29-31
734238,3657653996886348800.00000,DilliCantonment,Delhi,Xiaomi,E7,M,51.00000,M39+
738558,-3891168170303397888.00000,NangloiJat,Delhi,Coolpad,Arrow,M,25.00000,M23-26


In [11]:
# Attach the per-device attributes to every (device, location) count row.
# A left join keeps all rows of df_test.
df_map = df_test.merge(df_test1, how='left', on='device_id')
df_map

Unnamed: 0,device_id,longitude,latitude,data_count,city,state,phone_brand,device_model,gender,age,group
0,-9222956879900151808.00000,77.28042,28.71864,65,Delhi,Delhi,Samsung,Galaxy Note 2,M,36.00000,M32-38
1,-9221026417907252224.00000,77.24309,28.74644,147,Delhi,Delhi,Xiaomi,MI 3,F,31.00000,F29-32
2,-9218769147970108416.00000,77.24084,28.75193,20,Delhi,Delhi,Xiaomi,Redmi Note,M,23.00000,M23-26
3,-9218605091224095744.00000,77.02822,28.63578,12,RoshanPura,Delhi,Samsung,Galaxy Note 3,M,33.00000,M32-38
4,-9209849644716288000.00000,77.28803,28.68541,76,Delhi,Delhi,Xiaomi,Redmi 2a,F,30.00000,F29-32
...,...,...,...,...,...,...,...,...,...,...,...
4907,9203700427918537728.00000,77.26155,28.70455,6,Delhi,Delhi,Huawei,Ascend G7,F,31.00000,F29-32
4908,9204754392928356352.00000,77.28148,28.71456,71,Delhi,Delhi,Meizu,Charm blue note 2,M,23.00000,M23-26
4909,9210091097295259648.00000,77.27209,28.75612,56,Delhi,Delhi,VIVO,X5Max+,M,40.00000,M39+
4910,9216925254504448000.00000,77.23488,28.72913,109,Delhi,Delhi,Samsung,A320t,M,41.00000,M39+


In [12]:
# Check row count, dtypes and non-null counts of the merged frame.
df_map.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4912 entries, 0 to 4911
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   device_id     4912 non-null   float64
 1   longitude     4912 non-null   float64
 2   latitude      4912 non-null   float64
 3   data_count    4912 non-null   int64  
 4   city          4912 non-null   object 
 5   state         4912 non-null   object 
 6   phone_brand   4912 non-null   object 
 7   device_model  4912 non-null   object 
 8   gender        4912 non-null   object 
 9   age           4912 non-null   float64
 10  group         4912 non-null   object 
dtypes: float64(4), int64(1), object(6)
memory usage: 460.5+ KB


In [13]:
# Build a map centred on the given coordinates and add one circle marker per
# row of df_map; each marker's popup lists the device and its event count.
delhi_map = folium.Map(location=[28.6519500, 77.2314900])
for lat, lon, di, dc, pb, dm in zip(df_map['latitude'], df_map['longitude'],
                                    df_map['device_id'], df_map['data_count'],
                                    df_map['phone_brand'], df_map['device_model']):
    # Popup body is HTML; fields are separated with <br> line breaks.
    popup_html = (f'Device ID: {di}'
                  f'<br>Latitude: {lat}'
                  f'<br>Longitude: {lon}'
                  f'<br>Instances: {dc}'
                  f'<br>Brand: {pb}'
                  f'<br>Model: {dm}')

    # `threshold_scale` was removed: it is a choropleth argument and is not a
    # circle-marker styling option.
    folium.CircleMarker([lat, lon],
                        popup=popup_html,
                        color='green',
                        radius=5,
                        fill=False,
                        fill_opacity=0.3).add_to(delhi_map)
delhi_map

In [14]:
# Distinct longitudes recorded for this device (author's tag: Kabul).
df1.loc[df1['device_id'] == 1.0572898355663908e+18, 'longitude'].unique()

array([77.2242, 69.2075])

In [15]:
# Distinct latitudes recorded for this device (author's tag: Kabul).
df1.loc[df1['device_id'] == 1.0572898355663908e+18, 'latitude'].unique()

array([28.7233, 34.5553])

In [16]:
# Distinct longitudes recorded for this device (author's tag: Dubai).
df1.loc[df1['device_id'] == 2.334568628287628e+18, 'longitude'].unique()

array([77.2953, 55.2708])

In [17]:
# Distinct latitudes recorded for this device (author's tag: Dubai).
df1.loc[df1['device_id'] == 2.334568628287628e+18, 'latitude'].unique()

array([28.7493, 25.2048])

In [18]:
# Distinct longitudes recorded for this device (author's tag: Rome).
df1.loc[df1['device_id'] == -1.448078833416778e+18, 'longitude'].unique()

array([77.2485, 12.5674])

In [19]:
# Distinct latitudes recorded for this device (author's tag: Rome).
df1.loc[df1['device_id'] == -1.448078833416778e+18, 'latitude'].unique()

array([28.7273, 41.8719])

In [20]:
# Distinct timestamps logged by the device inspected above (id 1.0572898355663908e+18).
x = df1.loc[df1['device_id'] == 1.0572898355663908e+18, 'timestamp'].unique()

In [21]:
# Number of distinct timestamps held in `x`.
x.shape

(1492,)