In [1]:
import pandas as pd
import numpy as np

In [2]:
from sqlalchemy import create_engine

<h2> 1. Creating an engine to access the database

In [3]:
engine = create_engine('sqlite:///../data/raw/Albrodmar.db')

<h2>2. Opening all the tables on Pandas

In [4]:
df_business = pd.read_sql_query("SELECT * FROM business_info", engine)

In [5]:
df_personal = pd.read_sql_query("SELECT * FROM personal_info", engine)

In [6]:
df_rank = pd.read_sql_query('SELECT * FROM rank_info', engine)


<h4> All the df have a common column 'id' but df_rank has more lines thah the others df. Will 'outer' join the three base on 'id'


In [73]:
df_temp = pd.merge(df_business, df_personal, on = 'id', how = 'outer')
df_merged = pd.merge(df_temp, df_rank, on = 'id', how = 'outer')
df_merged.head(2)

Unnamed: 0.1,id,Unnamed: 0_x,Source,worth,worthChange,realTimeWorth,realTimePosition,lastName,age,Unnamed: 0_y,gender,country,image,name,position,Unnamed: 0
0,5390,52.0,Technology ==> Amazon,112.0 BUSD,0.0 millions USD,,1.0,bEZOS,54 years old,52.0,M,United States,https://specials-images.forbesimg.com/imageser...,jEFF BEzos,1.0,52
1,1675,53.0,Technology ==> Microsoft,90.0 BUSD,-0.001 millions USD,,2.0,GATEs,62 years old,53.0,,,https://specials-images.forbesimg.com/imageser...,BILL GAtes,2.0,53


<h4> Defining a function to drop columns

In [None]:
def dropcolumn(df,x):
    return (df.drop(columns = x, inplace = True))

<h4> Defining a function to drop rows

In [1]:
def droprow(df,x):
    return (df.drop[x])

<h2> 3. Start Data Wrangling

In [74]:
df_merged.isnull().sum()

id                     0
Unnamed: 0_x          52
Source                52
worth                 52
worthChange           52
realTimeWorth       2260
realTimePosition      52
lastName              52
age                  117
Unnamed: 0_y          52
gender                65
country               52
image                 52
name                   0
position              52
Unnamed: 0             0
dtype: int64

<h4>3.1 Drop column <b>realTimeWorth</b> because is full of nulls and column <b>lastName</b> because is useless having column <b>name</b>

In [75]:
dropcolumn(df_merged, 'realTimeWorth')

In [76]:
dropcolumn(df_merged, 'lastName')

<h4>3.2 Separete column source in column "Industry" and "Company"

In [11]:
df_merged[['industry','company_name']] = df_merged.Source.str.split(' ==>',expand = True)

#drop old column Source
dropcolumn(df_merged, 'Source')
df_merged.head(2)

Unnamed: 0.1,id,Unnamed: 0_x,worth,worthChange,realTimePosition,age,Unnamed: 0_y,gender,country,image,name,position,Unnamed: 0,industry,company_name
0,5390,52.0,112.0 BUSD,0.0 millions USD,1.0,54 years old,52.0,M,United States,https://specials-images.forbesimg.com/imageser...,jEFF BEzos,1.0,52,Technology,Amazon
1,1675,53.0,90.0 BUSD,-0.001 millions USD,2.0,62 years old,53.0,,,https://specials-images.forbesimg.com/imageser...,BILL GAtes,2.0,53,Technology,Microsoft


<h4>3.3 Capitalizing first letters only on column 'name'

In [12]:
df_merged['name'] = df_merged['name'].str.title()

In [13]:
df_merged.head(2)

Unnamed: 0.1,id,Unnamed: 0_x,worth,worthChange,realTimePosition,age,Unnamed: 0_y,gender,country,image,name,position,Unnamed: 0,industry,company_name
0,5390,52.0,112.0 BUSD,0.0 millions USD,1.0,54 years old,52.0,M,United States,https://specials-images.forbesimg.com/imageser...,Jeff Bezos,1.0,52,Technology,Amazon
1,1675,53.0,90.0 BUSD,-0.001 millions USD,2.0,62 years old,53.0,,,https://specials-images.forbesimg.com/imageser...,Bill Gates,2.0,53,Technology,Microsoft


<h4>3.4 Cleaning the 'age' column removing ' years old'

In [14]:
df_merged['age'] = df_merged.age.str.replace(' years old','')

<h4>3.5 Cleaning the 'worth' column removing 'BUSD'

In [15]:
df_merged['worth'] = df_merged.worth.str.replace('BUSD','')

<h4>3.6 Changing name of column <b>worth</b>

In [16]:
df_merged.rename(columns = {'worth':'worth in Billion USD'}, inplace = True)

<h4>3.7 Convert data in column 'worth in Billion USD' to float

In [17]:
df_merged['worth in Billion USD'] = df_merged['worth in Billion USD'].apply(pd.to_numeric)

<h4>3.8 Convert data in column 'age' to float

In [18]:
df_merged['age'] = df_merged['age'].apply(pd.to_numeric)

<h4>3.9 Cleaning the 'worthChange' column removing ' millions UDS'

In [19]:
df_merged['worthChange']=df_merged.worthChange.str.replace(' millions USD','')

<h4>3.10 Changing name of column 'worthChange'

In [20]:
df_merged.rename(columns = {'worthChange':'worth change in millions USD'}, inplace = True)

In [21]:
df_merged.head(2)

Unnamed: 0.1,id,Unnamed: 0_x,worth in Billion USD,worth change in millions USD,realTimePosition,age,Unnamed: 0_y,gender,country,image,name,position,Unnamed: 0,industry,company_name
0,5390,52.0,112.0,0.0,1.0,54.0,52.0,M,United States,https://specials-images.forbesimg.com/imageser...,Jeff Bezos,1.0,52,Technology,Amazon
1,1675,53.0,90.0,-0.001,2.0,62.0,53.0,,,https://specials-images.forbesimg.com/imageser...,Bill Gates,2.0,53,Technology,Microsoft


<h4>3.11 Changing 'ages' with a year of birth to years old 

In [22]:
df_merged['age'] = df_merged['age'].apply(lambda x: 2018 - x if x > 150 else x)

<h4>3.12 Changin 'gender' values to be only Male or Female

In [23]:
m_filter = df_merged['gender'] == 'M'
df_merged.loc[m_filter,'gender'] = df_merged.loc[m_filter,'gender'].replace('M','Male')

In [24]:
f_filter = df_merged['gender'] == 'F'
df_merged.loc[f_filter,'gender'] = df_merged.loc[f_filter,'gender'].replace('F','Female')

In [25]:
df_merged['gender'].value_counts()

Male      1467
None       549
Female     179
Name: gender, dtype: int64

In [63]:
def dropcolumn(df,x):
    return (df.drop(columns = x, inplace = True))

dropcolumn(df_merged, 'Unnamed: 0_y')

KeyError: "['Unnamed: 0_y'] not found in axis"

In [66]:
dropcolumn (df_merged,'Unnamed: 0_x')

df_merged

Unnamed: 0.1,id,worth in Billion USD,worth change in millions USD,realTimePosition,age,gender,country,image,name,position,Unnamed: 0,industry,company_name
0,5390,112.0,0.0,1.0,54.0,Male,United States,https://specials-images.forbesimg.com/imageser...,Jeff Bezos,1.0,52,Technology,Amazon
1,1675,90.0,-0.001,2.0,62.0,,,https://specials-images.forbesimg.com/imageser...,Bill Gates,2.0,53,Technology,Microsoft
2,2361,84.0,-0.002,3.0,87.0,,,https://specials-images.forbesimg.com/imageser...,Warren Buffett,3.0,54,Finance and Investments,Berkshire Hathaway
3,2340,72.0,0.0,4.0,69.0,,,https://specials-images.forbesimg.com/imageser...,Bernard Arnault,4.0,55,Fashion & Retail,LVMH
4,6891,71.0,0.0,5.0,33.0,,,https://specials-images.forbesimg.com/imageser...,Mark Zuckerberg,5.0,56,Technology,Facebook
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2255,3040,,,,,,,,Supaluck Umpujh & Family,,47,,
2256,8338,,,,,,,,Thomas Duff,,48,,
2257,7509,,,,,,,,Zakhar Smushkin,,49,,
2258,2942,,,,,,,,Zhang Zhongneng & Family,,50,,


In [31]:
df_merged['age'] = df_merged['age'].astype(int)

ValueError: Cannot convert non-finite values (NA or inf) to integer

In [186]:
df_merged[if_filter]

Unnamed: 0.1,id,Unnamed: 0_x,worth in Billion USD,worth change in millions USD,realTimePosition,age,Unnamed: 0_y,gender,country,image,name,position,Unnamed: 0,industry,company_name
26,1013,78.0,29.8,0.0,29.0,66.0,78.0,,,https://specials-images.forbesimg.com/imageser...,Beate Heister & Karl Albrecht Jr.,27.0,78,Fashion & Retail,supermarkets
76,2021,128.0,16.5,0.0,75.0,,128.0,,Hong Kong,https://specials-images.forbesimg.com/imageser...,Thomas & Raymond Kwok,77.0,128,Real Estate,real estate
89,6776,141.0,15.5,0.0,88.0,,141.0,,United Kingdom,https://specials-images.forbesimg.com/imageser...,David & Simon Reuben,90.0,141,Real Estate,"investments, real estate"
129,3611,181.0,11.7,0.0,110.0,,181.0,Male,,https://specials-images.forbesimg.com/imageser...,Udo & Harald Tschira,130.0,181,Technology,software
141,9220,193.0,10.8,0.0,143.0,,193.0,Male,,https://specials-images.forbesimg.com/imageser...,Robert & Philip Ng,142.0,193,Real Estate,real estate
