# Importing data

In [146]:
# Importing libraries
import pandas as pd
from sqlalchemy import create_engine

In [147]:
# Create a SQLAlchemy engine for the local SQLite file that contains the data.
# The path inside the URL is relative (../data/raw/), so it is resolved against
# the directory the notebook kernel is running in.
engine = create_engine("sqlite:///../data/raw/CristopherRL.db")
# NOTE(review): disabled alternative kept for provenance. It passes a plain
# http:// address rather than a database URL, so presumably the file has to be
# downloaded from there first — confirm before re-enabling.
#engine = create_engine('http://potacho.com/files/ironhack/CristopherRL.db')

In [148]:
# SQL that merges the three tables into one result set.
# personal_info is the base table; rank_info and business_info are attached
# with LEFT JOINs on `id`, so every personal_info row is kept even when it has
# no match in the other tables. Rows are sorted by `position`.
# NOTE(review): "Unnamed: 0" looks like a leftover index column from an earlier
# export — confirm whether it is needed downstream.
query = """ 
SELECT 
personal_info.id, 
position, 
lastName, 
rank_info.name, 
age, 
personal_info."Unnamed: 0", 
gender, 
country, 
image, 
business_info.Source,
business_info.worth,
business_info.worthChange,
business_info.realTimeWorth
FROM personal_info
LEFT JOIN rank_info     ON personal_info.id = rank_info.id
LEFT JOIN business_info ON personal_info.id = business_info.id
ORDER BY position
;
"""

In [149]:
# Run the join query against the SQLite engine and load the result into a
# DataFrame, then preview the first five rows (the default for head()).
raw_data = pd.read_sql_query(sql=query, con=engine)
raw_data.head()

Unnamed: 0.1,id,position,lastName,name,age,Unnamed: 0,gender,country,image,Source,worth,worthChange,realTimeWorth
0,8254,1.0,bEZOS,jefF BEZOS,54 years old,52,Male,,https://specials-images.forbesimg.com/imageser...,Technology ==> Amazon,112.0 BUSD,0.0 millions USD,
1,6688,2.0,gaTEs,bill GATES,62 years old,53,,,https://specials-images.forbesimg.com/imageser...,Technology ==> Microsoft,90.0 BUSD,-0.001 millions USD,
2,1904,3.0,bUFFETT,WARREN BUffett,87 years old,54,M,United States,https://specials-images.forbesimg.com/imageser...,Finance and Investments ==> Berkshire Hathaway,84.0 BUSD,-0.002 millions USD,
3,4168,4.0,aRNAULT,bernARD Arnault,69 years old,55,M,,https://specials-images.forbesimg.com/imageser...,Fashion & Retail ==> LVMH,72.0 BUSD,0.0 millions USD,
4,7545,5.0,zuckeRBERG,maRK ZUCKERBERG,1985,56,M,,https://specials-images.forbesimg.com/imageser...,Technology ==> Facebook,71.0 BUSD,0.0 millions USD,


# Wrangling data

In [150]:
import re

In [151]:
# Work on an independent copy of the raw frame. A plain assignment would only
# create a second name for the same object, so every cleaning step below would
# also modify raw_data and the cells would not be safely re-runnable.
proc_data = raw_data.copy()

In [152]:
# Number of rows and columns in the working frame.
proc_data.shape

(2208, 13)

In [153]:
# Inspect the dtype pandas assigned to every column.
proc_data.dtypes

id                 int64
position         float64
lastName          object
name              object
age               object
Unnamed: 0         int64
gender            object
country           object
image             object
Source            object
worth             object
worthChange       object
realTimeWorth     object
dtype: object

In [154]:
# List the column labels of the working frame.
proc_data.columns

Index(['id', 'position', 'lastName', 'name', 'age', 'Unnamed: 0', 'gender',
       'country', 'image', 'Source', 'worth', 'worthChange', 'realTimeWorth'],
      dtype='object')

In [155]:
# Count the missing values in each column, then display only the columns
# that actually contain at least one.
null_cols = proc_data.isnull().sum()
null_cols.loc[null_cols > 0]

age                65
gender             13
realTimeWorth    2208
dtype: int64

In [156]:
# Rows whose 'age' is missing, kept in their own frame for inspection.
null_displ = proc_data.loc[proc_data['age'].isnull()]
null_displ

Unnamed: 0.1,id,position,lastName,name,age,Unnamed: 0,gender,country,image,Source,worth,worthChange,realTimeWorth
54,1393,55.0,HINDUJA,hiNDUJA Family,,106,,,https://specials-images.forbesimg.com/imageser...,Diversified ==> diversified,19.5 BUSD,0.001 millions USD,
76,2609,77.0,KWOK,thomaS & RAYMOND kwok,,128,Male,Hong Kong,https://specials-images.forbesimg.com/imageser...,Real Estate ==> real estate,16.5 BUSD,0.0 millions USD,
89,6060,90.0,reUBEN,dAVID & SIMON REUBen,,141,M,,https://specials-images.forbesimg.com/imageser...,"Real Estate ==> investments, real estate",15.5 BUSD,0.0 millions USD,
129,7458,130.0,TSCHIRA,udo & HARALD TSCHIRA,,181,Male,,https://specials-images.forbesimg.com/imageser...,Technology ==> software,11.7 BUSD,0.0 millions USD,
141,3572,142.0,NG,ROBERT & PHIlip ng,,193,M,,https://specials-images.forbesimg.com/imageser...,Real Estate ==> real estate,10.8 BUSD,0.0 millions USD,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020,1160,2021.0,chIARAVANOND,yupa chiARavanond,,2072,F,,https://specials-images.forbesimg.com/imageser...,Diversified ==> diversified,1.1 BUSD,0.0 millions USD,
2021,2877,2022.0,CHIRAVAnond,PRATHIP CHIRAVanond,,2073,Male,,https://specials-images.forbesimg.com/imageser...,Diversified ==> diversified,1.1 BUSD,0.0 millions USD,
2028,3859,2029.0,EGGEr,wolFGANG EGGER,,2080,M,,https://specials-images.forbesimg.com/imageser...,Real Estate ==> real estate,1.1 BUSD,nan millions USD,
2044,6241,2045.0,kaPUr,madhu KApur,,2096,,,https://specials-images.forbesimg.com/imageser...,Finance and Investments ==> banking,1.1 BUSD,0.0 millions USD,


In [157]:
#Changing the data inside some columns

#Age
def real_age(x, reference_year=2019):
    """Convert one raw ``age`` cell into an integer age in years.

    The raw column mixes two formats: an age such as ``'54 years old'`` and a
    bare year of birth such as ``'1985'``. The first run of digits in the text
    decides which one it is.

    Parameters
    ----------
    x : str or None
        Raw cell value. ``None``, NaN and any other non-string value are
        treated as missing.
    reference_year : int, default 2019
        Year from which a year of birth is subtracted to obtain an age.

    Returns
    -------
    int
        The age in years, or the sentinel ``999`` when the value is missing
        or contains no digits.
    """
    missing_age = 999  # sentinel used to identify rows without age information

    # None / NaN / anything that is not text: there is nothing to parse.
    if not isinstance(x, str):
        return missing_age

    # First run of digits: the "y" in "y years old", or a year of birth.
    found = re.search(r'\d+', x)
    if found is None:
        return missing_age

    number = int(found.group())
    # Numbers below 100 are read as an age; anything larger is read as a
    # year of birth and converted using reference_year.
    if number < 100:
        return number
    return reference_year - number
# Apply real_age to every value of 'age' and store the result, cast to int64,
# in the new column 'n_age'.
# NOTE(review): the saved output of the final head(10) cell shows a column
# named 'modif_age' and no 'n_age' — that output looks stale; re-run to confirm.
proc_data['n_age'] = proc_data['age'].apply(real_age).astype('int64')

In [158]:
# gender: collapse the raw spellings into three labels.
# Missing values and the literal text 'None' become "Unknown"; the
# abbreviations 'M' and 'F' are expanded to "Male" and "Female".
proc_data['gender'] = (
    proc_data['gender']
    .fillna("Unknown")
    .replace({'None': "Unknown", 'M': "Male", 'F': "Female"})
)

In [159]:
# Sanity check: only three distinct gender labels should remain.
[*proc_data['gender'].unique()]

['Male', 'Unknown', 'Female']

In [160]:
# last name: store an upper-cased version in the new column 'LastName'.
# The raw 'lastName' column is kept; the two labels differ only in case.
proc_data['LastName'] = proc_data['lastName'].str.upper()
# name: same treatment, raw 'name' -> new upper-cased column 'Name'.
proc_data['Name'] = proc_data['name'].str.upper()

In [161]:
# Distinct country labels, in order of first appearance. The output shows the
# column is not normalised yet: it contains the text 'None' as well as
# duplicate spellings ('United States'/'USA', 'United Kingdom'/'UK',
# 'China'/"People's Republic of China").
[*proc_data['country'].unique()]

['None',
 'United States',
 'Hong Kong',
 'China',
 'Germany',
 'Brazil',
 'France',
 'Canada',
 'USA',
 'Japan',
 'India',
 'South Korea',
 'Indonesia',
 'United Kingdom',
 'Russia',
 'Nigeria',
 'Colombia',
 "People's Republic of China",
 'Mexico',
 'New Zealand',
 'Switzerland',
 'Italy',
 'Sweden',
 'Singapore',
 'South Africa',
 'UK',
 'Philippines',
 'Ukraine',
 'Israel',
 'Australia',
 'Denmark',
 'Portugal',
 'Chile',
 'Czech Republic',
 'Venezuela',
 'Vietnam',
 'Thailand',
 'United Arab Emirates',
 'Turkey',
 'Norway',
 'Malaysia',
 'Kazakhstan',
 'Taiwan',
 'Poland',
 'Netherlands',
 'Lebanon',
 'Argentina',
 'Spain',
 'Monaco',
 'Belgium',
 'Cyprus',
 'Peru',
 'Tanzania',
 'Austria',
 'Finland',
 'Oman',
 'Qatar',
 'Romania',
 'St. Kitts and Nevis',
 'Ireland']

In [None]:
# TODO: worth — not cleaned yet (raw values are text such as "112.0 BUSD")


In [162]:
# Cast columns to the types they should have.
# id: stored as object instead of int64 — presumably because it is an
# identifier rather than a quantity.
proc_data['id']       = proc_data['id'].astype('object') 
# position: loaded as float64; the null count earlier in the notebook lists no
# missing positions, so the cast to int64 does not run into NaN.
proc_data['position'] = proc_data['position'].astype('int64') 

In [163]:
# Preview the first ten rows of the working frame after the cleaning steps.
proc_data.head(10)

Unnamed: 0.1,id,position,lastName,name,age,Unnamed: 0,gender,country,image,Source,worth,worthChange,realTimeWorth,modif_age,LastName,Name
0,8254,1,bEZOS,jefF BEZOS,54 years old,52,Male,,https://specials-images.forbesimg.com/imageser...,Technology ==> Amazon,112.0 BUSD,0.0 millions USD,,54,BEZOS,JEFF BEZOS
1,6688,2,gaTEs,bill GATES,62 years old,53,Unknown,,https://specials-images.forbesimg.com/imageser...,Technology ==> Microsoft,90.0 BUSD,-0.001 millions USD,,62,GATES,BILL GATES
2,1904,3,bUFFETT,WARREN BUffett,87 years old,54,Male,United States,https://specials-images.forbesimg.com/imageser...,Finance and Investments ==> Berkshire Hathaway,84.0 BUSD,-0.002 millions USD,,87,BUFFETT,WARREN BUFFETT
3,4168,4,aRNAULT,bernARD Arnault,69 years old,55,Male,,https://specials-images.forbesimg.com/imageser...,Fashion & Retail ==> LVMH,72.0 BUSD,0.0 millions USD,,69,ARNAULT,BERNARD ARNAULT
4,7545,5,zuckeRBERG,maRK ZUCKERBERG,1985,56,Male,,https://specials-images.forbesimg.com/imageser...,Technology ==> Facebook,71.0 BUSD,0.0 millions USD,,34,ZUCKERBERG,MARK ZUCKERBERG
5,2790,6,ortEGA,AMANCIO ORTEGa,82 years old,57,Male,,https://specials-images.forbesimg.com/imageser...,Fashion & Retail ==> Zara,70.0 BUSD,0.0 millions USD,,82,ORTEGA,AMANCIO ORTEGA
6,8711,7,slim HELU,carLOS SLIM HELU,78 years old,58,Male,,https://specials-images.forbesimg.com/imageser...,Telecom ==> telecom,67.1 BUSD,0.001 millions USD,,78,SLIM HELU,CARLOS SLIM HELU
7,1066,8,koCH,chARLES Koch,82 years old,59,Male,,https://specials-images.forbesimg.com/imageser...,Diversified ==> Koch Industries,60.0 BUSD,0.0 millions USD,,82,KOCH,CHARLES KOCH
8,5843,9,kOCH,daVID KOCH,78 years old,60,Male,United States,https://specials-images.forbesimg.com/imageser...,Diversified ==> Koch Industries,60.0 BUSD,0.0 millions USD,,78,KOCH,DAVID KOCH
9,1718,10,ELLISON,laRRY ELLISon,73 years old,61,Unknown,United States,https://specials-images.forbesimg.com/imageser...,Technology ==> software,58.5 BUSD,-0.001 millions USD,,73,ELLISON,LARRY ELLISON
