# Cleaning unique user data

In [1]:
# importing necessary packages
import pandas as pd
import requests
from dotenv import load_dotenv
load_dotenv()
import os
import json
import flickrapi
import time
import numpy as np
import fuzzywuzzy
from fuzzywuzzy import fuzz, process



In [2]:
# read csv into df
df_user_data = pd.read_csv('data/unique_owner.csv')

  df_user_data = pd.read_csv('data/unique_owner.csv')


In [3]:
# check df
df_user_data.head(10)

Unnamed: 0.7,Unnamed: 0.6,Unnamed: 0.5,Unnamed: 0.4,Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,stat,person_id,person_nsid,...,person_has_free_educational_resources,person_timezone_label,person_timezone_offset,person_timezone_timezone_id,person_timezone_timezone,person_pro_badge,person_expire,person_mbox_sha1sum__content,code,message
0,0,0.0,0.0,0.0,0.0,0.0,0.0,ok,197843899@N04,197843899@N04,...,0.0,,,,,,,,,
1,1,1.0,1.0,1.0,1.0,1.0,1.0,ok,26745338@N06,26745338@N06,...,0.0,Brasilia,-03:00,America/Sao_Paulo,20.0,,,,,
2,2,2.0,2.0,2.0,2.0,2.0,2.0,ok,31331368@N00,31331368@N00,...,0.0,"Canberra, Melbourne, Sydney",+10:00,Australia/Canberra,68.0,standard,0.0,,,
3,3,3.0,3.0,3.0,3.0,3.0,3.0,ok,122687277@N03,122687277@N03,...,0.0,,,,,standard,0.0,,,
4,4,4.0,4.0,4.0,4.0,4.0,4.0,ok,196275842@N08,196275842@N08,...,0.0,,,,,,,,,
5,5,5.0,5.0,5.0,5.0,5.0,5.0,ok,44842144@N03,44842144@N03,...,0.0,,,,,standard,0.0,,,
6,6,6.0,6.0,6.0,6.0,6.0,6.0,ok,45008553@N04,45008553@N04,...,0.0,"Bangkok, Hanoi, Jakarta",+07:00,Asia/Bangkok,55.0,,,,,
7,7,7.0,7.0,7.0,7.0,7.0,7.0,ok,158117692@N02,158117692@N02,...,0.0,,,,,standard,1614069000.0,,,
8,8,8.0,8.0,8.0,8.0,8.0,8.0,ok,118071797@N03,118071797@N03,...,0.0,,,,,standard,0.0,,,
9,9,9.0,9.0,9.0,9.0,9.0,9.0,ok,169491036@N06,169491036@N06,...,0.0,,,,,,,,,


## Drop unnecessary columns

In [4]:
# check value count for is deleted
print(df_user_data.person_is_deleted.value_counts())
print(df_user_data.person_is_deleted.isna().sum())
# because there is only 1 value (False) and 24 NaN I decided to drop the column

0.0    38777
Name: person_is_deleted, dtype: int64
24


In [5]:
# list of columns to drop
columns_to_drop = ['Unnamed: 0.6','Unnamed: 0.5','Unnamed: 0.4', 'Unnamed: 0.3',
       'Unnamed: 0.2', 'Unnamed: 0.1', 'Unnamed: 0', 'stat','person_nsid','person_is_deleted',
       'person_iconserver','person_iconfarm','person_has_stats',
       'person_description__content',
       'person_photosurl__content', 'person_profileurl__content',
       'person_mobileurl__content','person_has_adfree', 'person_has_free_standard_shipping',
       'person_has_free_educational_resources','person_pro_badge','person_mbox_sha1sum__content','message','person_expire', 'code','person_timezone_timezone']

In [6]:
# check if its the right number
len(columns_to_drop)

26

In [7]:
# drop 26 columns
df_user_data_cleaned = df_user_data.drop(columns=columns_to_drop)

In [8]:
# check columns
df_user_data_cleaned.columns

Index(['person_id', 'person_ispro', 'person_path_alias',
       'person_username__content', 'person_realname__content',
       'person_location__content', 'person_photos_firstdatetaken__content',
       'person_photos_firstdate__content', 'person_photos_count__content',
       'person_timezone_label', 'person_timezone_offset',
       'person_timezone_timezone_id'],
      dtype='object')

## Alter column names

In [9]:
# replace 'person' with user and delete '_content'
df_user_data_cleaned.columns = df_user_data_cleaned.columns.str.replace('person','user').str.replace('__content','')


In [10]:
df_user_data_cleaned.columns

Index(['user_id', 'user_ispro', 'user_path_alias', 'user_username',
       'user_realname', 'user_location', 'user_photos_firstdatetaken',
       'user_photos_firstdate', 'user_photos_count', 'user_timezone_label',
       'user_timezone_offset', 'user_timezone_timezone_id'],
      dtype='object')

## delete NaN from user_location, bc it's our most important column

In [11]:
# clear NaN values from user_locations
df_user_data_cleaned = df_user_data_cleaned.dropna(subset='user_location')
df_user_data_cleaned = df_user_data_cleaned.reset_index(drop=True)

In [12]:
# check how much is still there
df_user_data_cleaned.shape

(14469, 12)

In [13]:
len(df_user_data)

38801

In [14]:
# check percentage of leftover data
x = (len(df_user_data_cleaned)/len(df_user_data))*100
print(x)

37.29027602381382


## alter dtypes

In [15]:
# check dtypes
df_user_data_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14469 entries, 0 to 14468
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   user_id                     14469 non-null  object 
 1   user_ispro                  14469 non-null  float64
 2   user_path_alias             9890 non-null   object 
 3   user_username               14469 non-null  object 
 4   user_realname               12591 non-null  object 
 5   user_location               14469 non-null  object 
 6   user_photos_firstdatetaken  14468 non-null  object 
 7   user_photos_firstdate       14468 non-null  float64
 8   user_photos_count           14469 non-null  float64
 9   user_timezone_label         13283 non-null  object 
 10  user_timezone_offset        13283 non-null  object 
 11  user_timezone_timezone_id   13283 non-null  object 
dtypes: float64(3), object(9)
memory usage: 1.3+ MB


In [16]:
# user_id to string
df_user_data_cleaned.user_id=df_user_data_cleaned.user_id.astype('string')

In [17]:
df_user_data_cleaned.user_id

0         26745338@N06
1         31331368@N00
2        122687277@N03
3         45008553@N04
4        158117692@N02
             ...      
14464    131418876@N03
14465     42905557@N00
14466     16939187@N06
14467     97454045@N07
14468     35939355@N04
Name: user_id, Length: 14469, dtype: string

In [18]:
# user_ispro to bool
df_user_data_cleaned.user_ispro=df_user_data_cleaned.user_ispro.astype('bool')

In [19]:
# user_location','user_realname','user_username','user_path_alias to string
df_user_data_cleaned[['user_location','user_realname','user_username','user_path_alias']] = df_user_data_cleaned[['user_location','user_realname','user_username','user_path_alias']].astype('string')

In [20]:
# user_photos_count to int
df_user_data_cleaned.user_photos_count=df_user_data_cleaned.user_photos_count.astype('int')

In [21]:
df_user_data_cleaned.user_photos_firstdate

0        1.212696e+09
1        1.099811e+09
2        1.397393e+09
3        1.259290e+09
4        1.515479e+09
             ...     
14464    1.424362e+09
14465    1.134207e+09
14466    1.197543e+09
14467    1.371249e+09
14468    1.251922e+09
Name: user_photos_firstdate, Length: 14469, dtype: float64

In [22]:
# user_firstdate unix to datetime
df_user_data_cleaned.user_photos_firstdate = pd.to_datetime(df_user_data_cleaned.user_photos_firstdate,unit='s',errors='coerce')

In [23]:
df_user_data_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14469 entries, 0 to 14468
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   user_id                     14469 non-null  string        
 1   user_ispro                  14469 non-null  bool          
 2   user_path_alias             9890 non-null   string        
 3   user_username               14469 non-null  string        
 4   user_realname               12591 non-null  string        
 5   user_location               14469 non-null  string        
 6   user_photos_firstdatetaken  14468 non-null  object        
 7   user_photos_firstdate       14468 non-null  datetime64[ns]
 8   user_photos_count           14469 non-null  int64         
 9   user_timezone_label         13283 non-null  object        
 10  user_timezone_offset        13283 non-null  object        
 11  user_timezone_timezone_id   13283 non-null  object    

In [24]:
# check type
type(df_user_data_cleaned.user_photos_firstdatetaken)

pandas.core.series.Series

In [25]:
# alter to string
df_user_data_cleaned.user_photos_firstdatetaken = df_user_data_cleaned.user_photos_firstdatetaken.astype('string')

In [26]:
df_user_data_cleaned.user_photos_firstdatetaken[9:15]

9     1840-01-01 00:00:00
10    2000-01-01 00:00:00
11    0000-00-00 00:00:00
12    0000-00-00 00:00:00
13    1980-01-01 00:00:07
14    1985-03-15 00:00:00
Name: user_photos_firstdatetaken, dtype: string

In [27]:
df_user_data_cleaned.user_photos_firstdatetaken = pd.to_datetime(df_user_data_cleaned.user_photos_firstdatetaken, dayfirst=True,format='%Y-%m-%d %H:%M:%S',errors='coerce')

In [28]:
# because coerce replaces error values with NaT 
# firstdatetaken is mysql datetime that starts with 1000-00-00 00:00:00 we "lost" 787 values
df_user_data_cleaned.user_photos_firstdatetaken.count()

13681

In [29]:
df_user_data_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14469 entries, 0 to 14468
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   user_id                     14469 non-null  string        
 1   user_ispro                  14469 non-null  bool          
 2   user_path_alias             9890 non-null   string        
 3   user_username               14469 non-null  string        
 4   user_realname               12591 non-null  string        
 5   user_location               14469 non-null  string        
 6   user_photos_firstdatetaken  13681 non-null  datetime64[ns]
 7   user_photos_firstdate       14468 non-null  datetime64[ns]
 8   user_photos_count           14469 non-null  int64         
 9   user_timezone_label         13283 non-null  object        
 10  user_timezone_offset        13283 non-null  object        
 11  user_timezone_timezone_id   13283 non-null  object    

In [30]:
df_user_data_cleaned[['user_timezone_offset','user_timezone_timezone_id','user_timezone_label']].sample(5)

Unnamed: 0,user_timezone_offset,user_timezone_timezone_id,user_timezone_label
12739,+00:00,Europe/London,"GMT: Dublin, Edinburgh, Lisbon, London"
5837,-08:00,PST8PDT,Pacific Time (US & Canada); Tijuana
9408,+00:00,Europe/London,"GMT: Dublin, Edinburgh, Lisbon, London"
12339,+10:00,Australia/Canberra,"Canberra, Melbourne, Sydney"
7837,+01:00,Europe/Amsterdam,"Amsterdam, Berlin, Bern, Rome, Stockholm, Vienna"


In [31]:
df_user_data_cleaned.user_timezone_offset.count()

13283

In [32]:
df_user_data_cleaned.user_timezone_timezone_id.count()

13283

In [33]:
df_user_data_cleaned[['user_timezone_offset','user_timezone_timezone_id','user_timezone_label']]= df_user_data_cleaned[['user_timezone_offset','user_timezone_timezone_id','user_timezone_label']].astype('string')

In [34]:
df_user_data_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14469 entries, 0 to 14468
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   user_id                     14469 non-null  string        
 1   user_ispro                  14469 non-null  bool          
 2   user_path_alias             9890 non-null   string        
 3   user_username               14469 non-null  string        
 4   user_realname               12591 non-null  string        
 5   user_location               14469 non-null  string        
 6   user_photos_firstdatetaken  13681 non-null  datetime64[ns]
 7   user_photos_firstdate       14468 non-null  datetime64[ns]
 8   user_photos_count           14469 non-null  int64         
 9   user_timezone_label         13283 non-null  string        
 10  user_timezone_offset        13283 non-null  string        
 11  user_timezone_timezone_id   13283 non-null  string    

## check for duplicates  

In [35]:
df_user_data_cleaned.duplicated().value_counts()

False    14177
True       292
dtype: int64

In [36]:
# we find more duplicates for one column, otherwise it compares the row
df_user_data_cleaned.duplicated(subset='user_id').value_counts()

False    14172
True       297
dtype: int64

In [37]:
# look at duplicates
df_user_data_cleaned[df_user_data_cleaned.duplicated(keep=False)].sort_values(by='user_id')

Unnamed: 0,user_id,user_ispro,user_path_alias,user_username,user_realname,user_location,user_photos_firstdatetaken,user_photos_firstdate,user_photos_count,user_timezone_label,user_timezone_offset,user_timezone_timezone_id
1630,100175587@N02,True,,cbelato (sometimes out),Cidinha Belato,"Rio de Janeiro, Brasil",2012-06-30 15:51:49,2013-08-13 00:04:46,2230,Brasilia,-03:00,America/Sao_Paulo
1927,100175587@N02,True,,cbelato (sometimes out),Cidinha Belato,"Rio de Janeiro, Brasil",2012-06-30 15:51:49,2013-08-13 00:04:46,2230,Brasilia,-03:00,America/Sao_Paulo
1701,100175587@N02,True,,cbelato (sometimes out),Cidinha Belato,"Rio de Janeiro, Brasil",2012-06-30 15:51:49,2013-08-13 00:04:46,2230,Brasilia,-03:00,America/Sao_Paulo
1653,100193876@N06,True,juliek1967,Julie from Wexford,,"Wexford, Ireland",2004-11-04 12:43:48,2013-08-12 11:10:59,3396,"GMT: Dublin, Edinburgh, Lisbon, London",+00:00,Europe/London
1724,100193876@N06,True,juliek1967,Julie from Wexford,,"Wexford, Ireland",2004-11-04 12:43:48,2013-08-12 11:10:59,3396,"GMT: Dublin, Edinburgh, Lisbon, London",+00:00,Europe/London
...,...,...,...,...,...,...,...,...,...,...,...,...
1780,98309604@N00,True,kleepet,kleepet,Karen Peterson,"Jersey City, NJ, USA",2001-05-27 12:52:05,2005-08-17 06:37:30,6097,Eastern Time (US & Canada),-05:00,EST5EDT
1893,98372297@N03,True,tiphainerolland,Tiphaine Rolland,Tiphaine Rolland,"Toronto, Canada",2010-08-01 00:00:00,2013-07-05 22:03:00,19104,"GMT: Dublin, Edinburgh, Lisbon, London",+00:00,Europe/London
2119,98372297@N03,True,tiphainerolland,Tiphaine Rolland,Tiphaine Rolland,"Toronto, Canada",2010-08-01 00:00:00,2013-07-05 22:03:00,19104,"GMT: Dublin, Edinburgh, Lisbon, London",+00:00,Europe/London
1827,98406434@N00,True,anyhoo,Anyhoo,,UK,1998-07-01 00:00:00,2004-09-16 17:31:20,25403,"GMT: Dublin, Edinburgh, Lisbon, London",+00:00,Europe/London


## keep existing user id's in user_in_df for updates

In [38]:
# drop duplicates and store existing user ids to a list
user_in_df=[]
df_user_data_cleaned = df_user_data_cleaned.drop_duplicates(subset='user_id')

for i in df_user_data_cleaned.user_id.unique():
    user_in_df.append(i)

In [39]:
df_user_data_cleaned.shape

(14172, 12)

In [40]:
# check if everything was right
print(user_in_df)
print(len(user_in_df))
# check type
type(user_in_df)

['26745338@N06', '31331368@N00', '122687277@N03', '45008553@N04', '158117692@N02', '169491036@N06', '153546415@N07', '78379788@N06', '115521748@N08', '44340545@N05', '45676495@N05', '96660892@N07', '196626126@N08', '45629904@N03', '49143546@N06', '165034204@N08', '64586261@N02', '197734992@N06', '94635768@N04', '197493901@N06', '90250231@N05', '17671746@N04', '79152742@N05', '18478048@N00', '126912597@N07', '21225329@N08', '149623494@N02', '149960743@N05', '75710752@N04', '197381218@N08', '31833404@N02', '97105070@N00', '185616361@N04', '46017193@N00', '21924357@N04', '30707075@N08', '25538536@N02', '8920035@N02', '15086926@N08', '191736856@N08', '155729724@N07', '153638711@N04', '38835423@N02', '191493143@N05', '187113622@N05', '26581186@N05', '199098213@N02', '98142888@N03', '29742055@N00', '97697738@N00', '194619535@N06', '155127994@N04', '58273812@N08', '135840124@N04', '98336874@N04', '60047372@N00', '198281398@N05', '10167052@N06', '7702423@N04', '8755091@N07', '8836196@N06', '12

list

In [41]:
df_user_data_cleaned.sample(5)

Unnamed: 0,user_id,user_ispro,user_path_alias,user_username,user_realname,user_location,user_photos_firstdatetaken,user_photos_firstdate,user_photos_count,user_timezone_label,user_timezone_offset,user_timezone_timezone_id
10531,92986809@N05,True,kurzik,George Kurzik,George Kurzik,"Hellam, Pennsylvania, USA",2012-06-03 12:03:01,2013-02-10 07:27:41,3986,Eastern Time (US & Canada),-05:00,EST5EDT
8421,45143032@N05,False,nick0las_alpha,Nick0las.Alpha,Sergey,"Kharkov, Ukraine",2007-09-01 10:27:41,2009-12-01 18:41:28,186,"Helsinki, Kyiv, Riga, Sofia, Tallinn, Vilnius",+02:00,Europe/Helsinki
5686,193293022@N08,False,,moholoanet,Moholoane Teboho,Vereeniging,2020-03-05 10:28:17,2021-06-13 02:44:13,266,,,
12186,191254924@N02,False,allenpaulphotos,Allen-Paul Photography & Artwork,Allen-Paul,UK,1988-01-01 00:00:00,2020-12-10 12:23:19,523,"GMT: Dublin, Edinburgh, Lisbon, London",+00:00,Europe/London
12017,29831066@N00,True,vini00,vini00,Vinicius Moreira,"Hammond, USA",2000-01-01 00:01:15,2007-01-18 03:32:45,992,Central Time (US & Canada),-06:00,CST6CDT


## split user location into useful columns

In [42]:
# check format of user_location
df_user_data_cleaned.user_location

0                    Melbourne, Australia
1                    Melbourne, Australia
2                               Melbourne
3        Phnom Penh, Kingdom of Cambodia 
4                    Melbourne, Australia
                       ...               
14464            Alcalá de Henares, Spain
14465                       Padova, Italy
14466                    Cagliari, Italia
14467                     Szeged, Hungary
14468                       Dinan, France
Name: user_location, Length: 14172, dtype: string

In [43]:
## conclusion timestamp: as I figured out flickr is creating timezone columns with arguments of location, if they fit in at least on 'city'and/or 'country' format
# check how many values for timezone_offset
df_user_data_cleaned.user_timezone_offset.isna().sum()

1177

In [44]:
# try to split user_location
df_split = df_user_data_cleaned.user_location.str.split(', ', expand=True)
df_split.sample(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
4673,London,United Kingdom,,,,,,,,,...,,,,,,,,,,
6633,Louisiana,USA,,,,,,,,,...,,,,,,,,,,


In [45]:
# check whats going on with 21 columns
df_split.sample(15)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
9338,Olomouc,Europe,,,,,,,,,...,,,,,,,,,,
25,Karlovac,Croatia,,,,,,,,,...,,,,,,,,,,
1231,Taipei,Taiwan,,,,,,,,,...,,,,,,,,,,
6442,United Arab Emirates,,,,,,,,,,...,,,,,,,,,,
8246,Santa Barbara,US,,,,,,,,,...,,,,,,,,,,
7407,Drachten,Nederland,,,,,,,,,...,,,,,,,,,,
6198,Alençon,France,,,,,,,,,...,,,,,,,,,,
14382,Deutschland,,,,,,,,,,...,,,,,,,,,,
12116,Essen,Deutschland,,,,,,,,,...,,,,,,,,,,
11890,Aiken,South Carolina,USA,,,,,,,,...,,,,,,,,,,


In [46]:
# check type of df_split
type(df_split)

pandas.core.frame.DataFrame

In [47]:
# note there is a wrong TZ
df_user_data_cleaned.loc[7152]

user_id                                                    91292806@N00
user_ispro                                                        False
user_path_alias                                              buferanera
user_username                           Giovanni Picuti, già Buferanera
user_realname                                           Giovanni Picuti
user_location                 Foligno, Italy (giovanni.picuti@alice.it)
user_photos_firstdatetaken                          1970-12-01 00:00:00
user_photos_firstdate                               2006-11-09 22:04:04
user_photos_count                                                  2213
user_timezone_label                        International Date Line West
user_timezone_offset                                             -12:00
user_timezone_timezone_id                                    Etc/GMT+12
Name: 7152, dtype: object

In [48]:
df_temp = df_user_data_cleaned.user_location.str.split(', ',expand=True)

In [49]:
df_temp

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,Melbourne,Australia,,,,,,,,,...,,,,,,,,,,
1,Melbourne,Australia,,,,,,,,,...,,,,,,,,,,
2,Melbourne,,,,,,,,,,...,,,,,,,,,,
3,Phnom Penh,Kingdom of Cambodia,,,,,,,,,...,,,,,,,,,,
4,Melbourne,Australia,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14464,Alcalá de Henares,Spain,,,,,,,,,...,,,,,,,,,,
14465,Padova,Italy,,,,,,,,,...,,,,,,,,,,
14466,Cagliari,Italia,,,,,,,,,...,,,,,,,,,,
14467,Szeged,Hungary,,,,,,,,,...,,,,,,,,,,


In [50]:
df_temp.columns

RangeIndex(start=0, stop=22, step=1)

In [51]:
# "rename" columns from range to names
df_temp.columns=['0','1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','21']

In [52]:
# merge new location columns to dataframe
df_user_data_cleaned=pd.merge(df_user_data_cleaned,df_temp,left_index=True, right_index=True)

In [53]:
df_user_data_cleaned.head()

Unnamed: 0,user_id,user_ispro,user_path_alias,user_username,user_realname,user_location,user_photos_firstdatetaken,user_photos_firstdate,user_photos_count,user_timezone_label,...,12,13,14,15,16,17,18,19,20,21
0,26745338@N06,False,danisonksen,dani.sonksen,Daniela Sönksen,"Melbourne, Australia",2005-05-09 10:56:54,2008-06-05 20:03:04,850,Brasilia,...,,,,,,,,,,
1,31331368@N00,True,lonelyradio,lonely radio,andrew j. cosgriff,"Melbourne, Australia",2000-02-01 00:00:00,2004-11-07 06:59:09,13381,"Canberra, Melbourne, Sydney",...,,,,,,,,,,
2,122687277@N03,True,philipmallis,philip.mallis,Philip Mallis,Melbourne,2007-09-21 04:37:37,2014-04-13 12:48:29,10793,,...,,,,,,,,,,
3,45008553@N04,False,reaksmeyyean,REAKSMEY Yean – GEORGE,REAKSMEY Yean – GEORGE,"Phnom Penh, Kingdom of Cambodia",2005-11-04 20:32:27,2009-11-27 02:43:31,236,"Bangkok, Hanoi, Jakarta",...,,,,,,,,,,
4,158117692@N02,True,,Julie McLennan,Julie McLennan,"Melbourne, Australia",2018-01-08 22:23:51,2018-01-09 06:23:51,2533,,...,,,,,,,,,,


In [54]:
# only the first 3 columns seem to have reliable data, so I dropped the rest
df_user_data_cleaned = df_user_data_cleaned.drop(columns=['3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','21'])

In [55]:
df_user_data_cleaned.columns

Index(['user_id', 'user_ispro', 'user_path_alias', 'user_username',
       'user_realname', 'user_location', 'user_photos_firstdatetaken',
       'user_photos_firstdate', 'user_photos_count', 'user_timezone_label',
       'user_timezone_offset', 'user_timezone_timezone_id', '0', '1', '2'],
      dtype='object')

In [56]:
df_user_data_cleaned.shape

(14172, 15)

In [57]:
df_user_data_cleaned.head()

Unnamed: 0,user_id,user_ispro,user_path_alias,user_username,user_realname,user_location,user_photos_firstdatetaken,user_photos_firstdate,user_photos_count,user_timezone_label,user_timezone_offset,user_timezone_timezone_id,0,1,2
0,26745338@N06,False,danisonksen,dani.sonksen,Daniela Sönksen,"Melbourne, Australia",2005-05-09 10:56:54,2008-06-05 20:03:04,850,Brasilia,-03:00,America/Sao_Paulo,Melbourne,Australia,
1,31331368@N00,True,lonelyradio,lonely radio,andrew j. cosgriff,"Melbourne, Australia",2000-02-01 00:00:00,2004-11-07 06:59:09,13381,"Canberra, Melbourne, Sydney",+10:00,Australia/Canberra,Melbourne,Australia,
2,122687277@N03,True,philipmallis,philip.mallis,Philip Mallis,Melbourne,2007-09-21 04:37:37,2014-04-13 12:48:29,10793,,,,Melbourne,,
3,45008553@N04,False,reaksmeyyean,REAKSMEY Yean – GEORGE,REAKSMEY Yean – GEORGE,"Phnom Penh, Kingdom of Cambodia",2005-11-04 20:32:27,2009-11-27 02:43:31,236,"Bangkok, Hanoi, Jakarta",+07:00,Asia/Bangkok,Phnom Penh,Kingdom of Cambodia,
4,158117692@N02,True,,Julie McLennan,Julie McLennan,"Melbourne, Australia",2018-01-08 22:23:51,2018-01-09 06:23:51,2533,,,,Melbourne,Australia,


In [58]:
# check if there is a timezone generated with only usa
df_user_data_cleaned.loc[6316]

user_id                                             197323723@N06
user_ispro                                                  False
user_path_alias                                              <NA>
user_username                                    portauthursmiles
user_realname                                  Port authur Smiles
user_location                                       united states
user_photos_firstdatetaken                    2022-12-31 01:20:20
user_photos_firstdate                         2022-12-31 09:20:46
user_photos_count                                              11
user_timezone_label           Pacific Time (US & Canada); Tijuana
user_timezone_offset                                       -08:00
user_timezone_timezone_id                                 PST8PDT
0                                                   united states
1                                                            <NA>
2                                                            <NA>
Name: 6316

## figuring out what happens in the location column

In [59]:
# change dt to list
# split without expand=true
df_split = df_user_data_cleaned.user_location.str.split(', ')
df_split.tolist()

[['Melbourne', 'Australia'],
 ['Melbourne', 'Australia'],
 ['Melbourne'],
 ['Phnom Penh', 'Kingdom of Cambodia '],
 ['Melbourne', 'Australia'],
 ['Barishal', 'Bangladesh'],
 ['Melbourne', 'Australia'],
 ['Australia'],
 ['Singapore', 'Singapore'],
 ['Manchester', 'Greater Manchester'],
 ['Colton', 'England'],
 ['Edinburgh', 'Scotland'],
 ['West Palm Beach', 'United States'],
 ['Brasília', 'Brasil'],
 ['Brasilia', 'Brazil'],
 ['Brasil'],
 ['Brasília', 'Brasil'],
 ['Brasília (DF)', 'Brasil'],
 ['Berlin', 'Deutschland'],
 ['California', 'united states'],
 ['Berlin  Rabat Paris'],
 ['Malakoff', 'France'],
 ['Bath', 'England'],
 ['Hallein', 'Austria'],
 ['Poland'],
 ['Karlovac', 'Croatia'],
 ['Poznań', 'Poland'],
 ['Austria'],
 ['Burgos', 'España'],
 ['Bratislava', 'Slovakia'],
 ['Žilina', 'Slovakia'],
 ['Zagreb', 'Croatia'],
 ['Greenville', 'NC', 'United States'],
 ['Devon', 'UK'],
 ['Oslo', 'Norway'],
 ['United States of America'],
 ['Illogan', 'United Kingdom'],
 ['Hull', 'England'],
 ['M

### decided to only keep the first 3 columns as they fit the format

In [60]:
three_list=[]
for x in df_split:
    if len(x)<=3:
        three_list.append(x)
print(len(three_list))

14088


In [61]:
rest_list=[]
for x in df_split:
    if len(x)>=4:
        rest_list.append(x)
print(len(rest_list))

84


In [62]:
len(df_user_data_cleaned.user_location)

14172

In [63]:
# 94% follow the format
(len(rest_list)/len(three_list))*100

0.596252129471891

## getting new df's country,city, state

In [64]:
df_countries = pd.read_csv('data/countries_cap.csv',sep=';')

In [65]:
df_countries.head()

Unnamed: 0,name,alpha-2,alpha-3,country-code,iso_3166-2,region,sub-region,intermediate-region,region-code,sub-region-code,intermediate-region-code
0,Afghanistan,AF,AFG,4,ISO 3166-2:AF,Asia,Southern Asia,,142.0,34.0,
1,Åland Islands,AX,ALA,248,ISO 3166-2:AX,Europe,Northern Europe,,150.0,154.0,
2,Albania,AL,ALB,8,ISO 3166-2:AL,Europe,Southern Europe,,150.0,39.0,
3,Algeria,DZ,DZA,12,ISO 3166-2:DZ,Africa,Northern Africa,,2.0,15.0,
4,American Samoa,AS,ASM,16,ISO 3166-2:AS,Oceania,Polynesia,,9.0,61.0,


In [66]:
df_countries = df_countries.drop(columns= ['alpha-3', 'country-code', 'iso_3166-2', 'region',
       'sub-region', 'intermediate-region', 'region-code', 'sub-region-code',
       'intermediate-region-code'])

In [67]:
df_countries.columns

Index(['name', 'alpha-2'], dtype='object')

In [68]:
df_countries.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 249 entries, 0 to 248
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   name     249 non-null    object
 1   alpha-2  248 non-null    object
dtypes: object(2)
memory usage: 4.0+ KB


In [69]:
df_countries = df_countries[['name','alpha-2']].astype('string')

In [70]:
dict = {'name': 'country_name',
        'alpha-2': 'country_code'}

In [71]:
df_countries.rename(columns=dict,
          inplace=True)

In [72]:
df_countries.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 249 entries, 0 to 248
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   country_name  249 non-null    string
 1   country_code  248 non-null    string
dtypes: string(2)
memory usage: 4.0 KB


In [73]:
df_countries['country_name_2'] = df_countries.country_name

In [74]:
df_countries.head(10)

Unnamed: 0,country_name,country_code,country_name_2
0,Afghanistan,AF,Afghanistan
1,Åland Islands,AX,Åland Islands
2,Albania,AL,Albania
3,Algeria,DZ,Algeria
4,American Samoa,AS,American Samoa
5,Andorra,AD,Andorra
6,Angola,AO,Angola
7,Anguilla,AI,Anguilla
8,Antarctica,AQ,Antarctica
9,Antigua and Barbuda,AG,Antigua and Barbuda


In [75]:
countries_dict_co_na = df_countries.set_index('country_code')['country_name'].to_dict()
countries_dict_na_na = df_countries.set_index('country_name')['country_name_2'].to_dict()

In [76]:
countries_dict_na_na

{'Afghanistan': 'Afghanistan',
 'Åland Islands': 'Åland Islands',
 'Albania': 'Albania',
 'Algeria': 'Algeria',
 'American Samoa': 'American Samoa',
 'Andorra': 'Andorra',
 'Angola': 'Angola',
 'Anguilla': 'Anguilla',
 'Antarctica': 'Antarctica',
 'Antigua and Barbuda': 'Antigua and Barbuda',
 'Argentina': 'Argentina',
 'Armenia': 'Armenia',
 'Aruba': 'Aruba',
 'Australia': 'Australia',
 'Austria': 'Austria',
 'Azerbaijan': 'Azerbaijan',
 'Bahamas': 'Bahamas',
 'Bahrain': 'Bahrain',
 'Bangladesh': 'Bangladesh',
 'Barbados': 'Barbados',
 'Belarus': 'Belarus',
 'Belgium': 'Belgium',
 'Belize': 'Belize',
 'Benin': 'Benin',
 'Bermuda': 'Bermuda',
 'Bhutan': 'Bhutan',
 'Bolivia (Plurinational State of)': 'Bolivia (Plurinational State of)',
 'Bonaire, Sint Eustatius and Saba': 'Bonaire, Sint Eustatius and Saba',
 'Bosnia and Herzegovina': 'Bosnia and Herzegovina',
 'Botswana': 'Botswana',
 'Bouvet Island': 'Bouvet Island',
 'Brazil': 'Brazil',
 'British Indian Ocean Territory': 'British Indi

In [77]:
type(countries_dict_co_na)

dict

In [78]:
df_user_data_cleaned['0'] = df_user_data_cleaned['0'].replace(np.nan,'0')
df_user_data_cleaned['1'] = df_user_data_cleaned['1'].replace(np.nan,'0')
df_user_data_cleaned['2'] = df_user_data_cleaned['2'].replace(np.nan,'0')

In [79]:
# df_user_data_cleaned['user_country'] = df_user_data_cleaned['0'].map(countries_dict_co_na)
# df_user_data_cleaned['user_country'] = df_user_data_cleaned['1'].map(countries_dict_co_na)
# df_user_data_cleaned['user_country'] = df_user_data_cleaned['2'].map(countries_dict_co_na)

In [80]:
# df_user_data_cleaned['user_country'] = df_user_data_cleaned['0'].map(countries_dict_na_na)
# df_user_data_cleaned['user_country2'] = df_user_data_cleaned['1'].map(countries_dict_na_na)
# df_user_data_cleaned['user_country3'] = df_user_data_cleaned['2'].map(countries_dict_na_na)

In [81]:
df_user_data_cleaned.sample(5)

Unnamed: 0,user_id,user_ispro,user_path_alias,user_username,user_realname,user_location,user_photos_firstdatetaken,user_photos_firstdate,user_photos_count,user_timezone_label,user_timezone_offset,user_timezone_timezone_id,0,1,2
7206,198675203@N07,False,,isthebestautotest,Zukunft_ist_jetzt Zukunft_ist_jetzt,Stuttgart Mitte,NaT,2023-07-04 23:40:39,686,,,,Stuttgart Mitte,0,0
5223,55216637@N04,True,ilwphotos,Ian L Winter,Ian Winter,"St. John's, Canada",2002-06-01 11:28:54,2010-10-28 12:35:48,10020,Newfoundland,-03:30,Canada/Newfoundland,St. John's,Canada,0
12585,166486267@N06,False,,STORMTROOPER123.....IRELAND...,INDIGO WOLFSBANE,NORTHERN IRELAND....,2003-07-10 09:41:08,2018-08-28 17:44:43,282,"GMT: Dublin, Edinburgh, Lisbon, London",+00:00,Europe/London,NORTHERN IRELAND....,0,0
7706,95098446@N07,False,robyjy,DolciPocoDolci e SalatoSalute,Roberta Giovagnoli,"Fabrica di Roma, Italia",2007-01-01 12:09:18,2013-04-23 09:01:24,436,Pacific Time (US & Canada); Tijuana,-08:00,PST8PDT,Fabrica di Roma,Italia,0
2528,192791471@N02,True,wsmears,wsmears,Bill Mears,Raleigh NC,2007-07-28 12:04:48,2021-04-20 13:37:47,1487,Eastern Time (US & Canada),-05:00,EST5EDT,Raleigh NC,0,0


In [82]:
# def fuzzy_match_country(text, choices):
#     # Use fuzzywuzzy to find the best match
#     best_match, score = process.extractOne(text, choices, scorer=fuzz.ratio)
#     # Set a threshold for a minimum score to consider a match
#     threshold = 80  # You can adjust this threshold as needed
#     if score >= threshold:
#         return best_match
#     else:
#         return None

# # Clean up the 'country_code_name' column
# df_user_data_cleaned['user_country_4'] = df_user_data_cleaned['0'].apply(lambda x: fuzzy_match_country(x, countries_dict_na_na.keys()))

In [83]:
# df_user_data_cleaned['user_country_5'] = df_user_data_cleaned['1'].apply(lambda x: fuzzy_match_country(x, countries_dict_na_na.keys()))

In [84]:
df_countries[df_countries.iloc[:,0]== 'Hong Kong']

Unnamed: 0,country_name,country_code,country_name_2
100,Hong Kong,HK,Hong Kong


In [85]:
df_user_data_cleaned.sample(3)

Unnamed: 0,user_id,user_ispro,user_path_alias,user_username,user_realname,user_location,user_photos_firstdatetaken,user_photos_firstdate,user_photos_count,user_timezone_label,user_timezone_offset,user_timezone_timezone_id,0,1,2
13751,141689143@N07,False,bert-s_kijk_op_de_wereld,benBert47,Bert Verspuij,"Zwijndrecht, Nederland",1984-03-07 00:00:00,2016-06-13 15:07:25,296,"Amsterdam, Berlin, Bern, Rome, Stockholm, Vienna",+01:00,Europe/Amsterdam,Zwijndrecht,Nederland,0
4797,88876166@N00,True,tabor-roeder,Phil Roeder,Phil Roeder,"Des Moines, IA, USA",1953-09-01 00:00:00,2006-03-29 04:24:12,18681,Central Time (US & Canada),-06:00,CST6CDT,Des Moines,IA,USA
3330,111035648@N08,False,,Paolo Strologo Media,Paolo .Strologo,"Numana, Italia",2015-04-22 15:35:07,2016-03-11 12:44:33,23,"Amsterdam, Berlin, Bern, Rome, Stockholm, Vienna",+01:00,Europe/Amsterdam,Numana,Italia,0


In [86]:
df_world_countries = pd.read_csv('data/world.csv')

In [87]:
df_world_countries.head(2)

Unnamed: 0,id,alpha2,alpha3,ar,bg,cs,da,de,el,en,...,ro,ru,sk,sl,sr,sv,th,uk,zh,zh-tw
0,4,af,afg,أفغانستان,Афганистан,Afghánistán,Afghanistan,Afghanistan,Αφγανιστάν,Afghanistan,...,Afganistan,Афганистан,Afganistan,Afganistan,Avganistan,Afghanistan,อัฟกานิสถาน,Афганістан,阿富汗,阿富汗
1,248,ax,ala,جزر أولاند,Оландски острови,Alandy,Ålandsøerne,Åland,Ώλαντ,Åland Islands,...,Insulele Åland,Аландские острова,Alandy,Ålandski otoki,Olandska Ostrva,Åland,หมู่เกาะโอลันด์,Аландські острови,奥兰,奧蘭


In [88]:
df_world_countries = df_world_countries.drop(columns='id')

In [89]:
df_world_countries.columns

Index(['alpha2', 'alpha3', 'ar', 'bg', 'cs', 'da', 'de', 'el', 'en', 'eo',
       'es', 'et', 'eu', 'fi', 'fr', 'hr', 'hu', 'hy', 'it', 'ja', 'ko', 'lt',
       'nl', 'no', 'pl', 'pt', 'ro', 'ru', 'sk', 'sl', 'sr', 'sv', 'th', 'uk',
       'zh', 'zh-tw'],
      dtype='object')

In [90]:
l = df_world_countries.set_index('en').apply(lambda row: {row.name: row.values.tolist()}, axis=1).tolist()

In [91]:
l_dict = {
    k: list(v.values()) 
    for k, v 
    in df_world_countries.set_index('en').to_dict('index').items()
}

In [92]:
type(l_dict)

dict

In [93]:
type(l)

list

In [98]:
l_dict

{'Afghanistan': ['af',
  'afg',
  'أفغانستان',
  'Афганистан',
  'Afghánistán',
  'Afghanistan',
  'Afghanistan',
  'Αφγανιστάν',
  'Afganio',
  'Afganistán',
  'Afganistan',
  'Afganistan',
  'Afganistan',
  'Afghanistan',
  'Afganistan',
  'Afganisztán',
  'Աֆղանստան',
  'Afghanistan',
  'アフガニスタン',
  '아프가니스탄',
  'Afganistanas',
  'Afghanistan',
  'Afghanistan',
  'Afganistan',
  'Afeganistão',
  'Afganistan',
  'Афганистан',
  'Afganistan',
  'Afganistan',
  'Avganistan',
  'Afghanistan',
  'อัฟกานิสถาน',
  'Афганістан',
  '阿富汗',
  '阿富汗'],
 'Åland Islands': ['ax',
  'ala',
  'جزر أولاند',
  'Оландски острови',
  'Alandy',
  'Ålandsøerne',
  'Åland',
  'Ώλαντ',
  'Alando',
  'Åland',
  'Ahvenamaa',
  'Aland',
  'Ahvenanmaa',
  'Îles Åland',
  'Ålandski otoci',
  'Åland',
  'Ալանդյան կղզիներ',
  'Isole Åland',
  'オーランド諸島',
  '올란드 제도',
  'Alandų salos',
  'Åland',
  'Åland',
  'Wyspy Alandzkie',
  'Ilhas Åland',
  'Insulele Åland',
  'Аландские острова',
  'Alandy',
  'Ålandski otoki',


In [99]:
# df_user_data_cleaned['user_country'] = df_user_data_cleaned['0'].map(l_dict)
df_user_data_cleaned['user_country2'] = df_user_data_cleaned['1'].map(l_dict)
# df_user_data_cleaned['user_country3'] = df_user_data_cleaned['2'].map(l_dict)

In [None]:
for key, value in l_dict:
    if value == df_user_data_cleaned['1']:
        df_user_data_cleaned['1'].append(key)

In [100]:
df_user_data_cleaned.head()

Unnamed: 0,user_id,user_ispro,user_path_alias,user_username,user_realname,user_location,user_photos_firstdatetaken,user_photos_firstdate,user_photos_count,user_timezone_label,user_timezone_offset,user_timezone_timezone_id,0,1,2,user_country2
0,26745338@N06,False,danisonksen,dani.sonksen,Daniela Sönksen,"Melbourne, Australia",2005-05-09 10:56:54,2008-06-05 20:03:04,850,Brasilia,-03:00,America/Sao_Paulo,Melbourne,Australia,0,"[au, aus, أستراليا, Австралия, Austrálie, Aust..."
1,31331368@N00,True,lonelyradio,lonely radio,andrew j. cosgriff,"Melbourne, Australia",2000-02-01 00:00:00,2004-11-07 06:59:09,13381,"Canberra, Melbourne, Sydney",+10:00,Australia/Canberra,Melbourne,Australia,0,"[au, aus, أستراليا, Австралия, Austrálie, Aust..."
2,122687277@N03,True,philipmallis,philip.mallis,Philip Mallis,Melbourne,2007-09-21 04:37:37,2014-04-13 12:48:29,10793,,,,Melbourne,0,0,
3,45008553@N04,False,reaksmeyyean,REAKSMEY Yean – GEORGE,REAKSMEY Yean – GEORGE,"Phnom Penh, Kingdom of Cambodia",2005-11-04 20:32:27,2009-11-27 02:43:31,236,"Bangkok, Hanoi, Jakarta",+07:00,Asia/Bangkok,Phnom Penh,Kingdom of Cambodia,0,
4,158117692@N02,True,,Julie McLennan,Julie McLennan,"Melbourne, Australia",2018-01-08 22:23:51,2018-01-09 06:23:51,2533,,,,Melbourne,Australia,0,"[au, aus, أستراليا, Австралия, Austrálie, Aust..."


In [None]:
countries_list = df_countries.values.tolist()

In [None]:
type(countries_list)

In [None]:
countries_list

In [None]:
countries_list_flat = []

for x in countries_list:
    for y in x:
        countries_list_flat.append(y)

print(countries_list_flat)

In [None]:
df_user_data_cleaned.info()

In [None]:
# location_list_flat = []

# for x in three_list:
#     for y in x:
#         location_list_flat.append(y)

# print(location_list_flat)

In [None]:
# counter = 0
# for x,y in zip(location_list_flat,countries_list_flat):
#     if x == y:
#         counter += 1
#         print(counter)

In [None]:
# 

In [None]:
# iterate through city, state/provence, country list
for a in three_list:
    if a == countries_list:
        #df_user_data_cleaned[['user_country']].append(a)
        print('jo')
    # elif b == df_countries:
    #     df_user_data_cleaned[['user_country']].append(b)
    #     print('jo')
    # elif c == df_countries:
    #     df_user_data_cleaned[['user_country']].append(c)
    #     print('jo')
        
        

In [None]:
df_user_data_cleaned['user_country']