## Clean Country Data File

### Code by Hannah Ali

In [1]:
#import dependencies
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the raw country-level dataset ("countries of the world") from CSV
file_name = "../country_data/countries of the world.csv"
import_csv = pd.read_csv(file_name)

In [3]:
# Wrap the loaded CSV contents in a DataFrame named `countries_data`
# NOTE(review): pd.read_csv already returns a DataFrame, so this wrapper looks redundant — confirm before removing
countries_data = pd.DataFrame(import_csv)

In [4]:
# Discard the columns that the rest of the notebook never reads
unused_columns = [
    'Region',
    'Coastline (coast/area ratio)',
    'Net migration',
    'Infant mortality (per 1000 births)',
    'Literacy (%)',
    'Phones (per 1000)',
    'Crops (%)',
    'Other (%)',
    'Climate',
    'Birthrate',
    'Deathrate',
    'Agriculture',
    'Industry',
    'Arable (%)',
]
countries_data_clean = countries_data.drop(columns=unused_columns)

In [5]:
# Rename the column 'Area (sq. mi.)' to 'Area'
countries_data_clean = countries_data_clean.rename(
    columns={'Area (sq. mi.)': 'Area'}
)

In [6]:
# Rename the column 'Pop. Density (per sq. mi.)' to 'Pop_Density'
countries_data_clean = countries_data_clean.rename(
    columns={'Pop. Density (per sq. mi.)': 'Pop_Density'}
)

In [7]:
# Rename the column 'GDP ($ per capita)' to 'GDP'
countries_data_clean = countries_data_clean.rename(
    columns={'GDP ($ per capita)': 'GDP'}
)

In [8]:
# Remove trailing spaces from every country name (the raw file stores e.g. 'Afghanistan ').
# A vectorized, whole-column assignment replaces the old row-by-row loop, which wrote
# through chained indexing (df["Country"][j] = ...): that pattern is not guaranteed to
# update the frame (it is a no-op under pandas copy-on-write), relied on a positional
# counter matching the index labels, and raised on missing (non-string) values.
# Note: rstrip(" ") removes all trailing spaces, not just a single one.
countries_data_clean["Country"] = countries_data_clean["Country"].str.rstrip(" ")

In [9]:
# Spot-check the first row to confirm the trailing space is gone
countries_data_clean.loc[0, "Country"]

'Afghanistan'

In [10]:
# (Disabled) Replace the country value 'United Kingdom' with 'England'
# countries_data_clean["Country"].replace('United Kingdom', 'England',  inplace=True)

In [11]:
# Keep only the most visited countries.
# .copy() makes countries_df an independent frame: later cells assign into it, and
# writing into a bare slice of countries_data_clean triggers SettingWithCopyWarning
# and may not behave consistently across pandas versions.
most_visited_countries = ["United Kingdom", "Ireland", "France", "Italy"]
countries_df = countries_data_clean.loc[
    countries_data_clean['Country'].isin(most_visited_countries)
].copy()

In [12]:
# Preview the first rows of the filtered frame
countries_df.head()

Unnamed: 0,Country,Population,Area,Pop_Density,GDP,Service
69,France,60876136,547030,1113,27600.0,764
98,Ireland,4062235,70280,578,29600.0,49
101,Italy,58133509,301230,1930,26700.0,688
213,United Kingdom,60609153,244820,2476,27700.0,758


In [13]:
# Pop_Density is stored as text with a decimal comma (e.g. '111,3'); swap the comma
# for a decimal point so the column can later be cast to float.
# The column is addressed by name rather than by the positional index 3, so the cell
# keeps working if the column order changes, and the replacement is vectorized
# (regex=False: ',' is treated as a literal character).
countries_df['Pop_Density'] = countries_df['Pop_Density'].str.replace(',', '.', regex=False)

In [14]:
# Service also uses a decimal comma; convert it to a decimal point
service_text = countries_df['Service']
countries_df['Service'] = service_text.str.replace(',', '.')

In [15]:
# Inspect column dtypes and non-null counts of the filtered frame
countries_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4 entries, 69 to 213
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Country      4 non-null      object 
 1   Population   4 non-null      int64  
 2   Area         4 non-null      int64  
 3   Pop_Density  4 non-null      object 
 4   GDP          4 non-null      float64
 5   Service      4 non-null      object 
dtypes: float64(1), int64(2), object(3)
memory usage: 224.0+ bytes


In [16]:
# Convert the cleaned text columns to floating-point numbers
for numeric_col in ('Pop_Density', 'Service'):
    countries_df[numeric_col] = countries_df[numeric_col].astype(float)

In [17]:
# Re-inspect column dtypes and non-null counts of the frame
countries_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4 entries, 69 to 213
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Country      4 non-null      object 
 1   Population   4 non-null      int64  
 2   Area         4 non-null      int64  
 3   Pop_Density  4 non-null      float64
 4   GDP          4 non-null      float64
 5   Service      4 non-null      float64
dtypes: float64(3), int64(2), object(1)
memory usage: 224.0+ bytes


In [18]:
# Convert the column headers to lowercase
uppercase_headers = ('Country', 'Population', 'Area', 'Pop_Density', 'GDP', 'Service')
countries_df = countries_df.rename(
    columns={header: header.lower() for header in uppercase_headers}
)

In [19]:
#export with no index
# countries_df.to_csv("../Resources/countries.csv", index=False)

### Push to Database


In [20]:
from getpass import getpass
from urllib.parse import quote_plus

from sqlalchemy import create_engine

# Prompt for the password at run time so it is never stored in the notebook.
password = getpass("Enter database password")

# Percent-encode the password before embedding it in the connection URL: characters
# such as '@', ':' or '/' would otherwise be parsed as URL delimiters and produce a
# broken (or wrong) connection string.
engine = create_engine(
    f'postgresql://postgres:{quote_plus(password)}@europeanrestaurants.codkjybkqvuj.us-east-1.rds.amazonaws.com',
    echo=False,
)

Enter database password········


In [21]:
from sqlalchemy import inspect

# List the tables currently in the database.
# Engine.table_names() was deprecated in SQLAlchemy 1.4 and removed in 2.0; the
# Inspector returns the same list of table names on both old and new versions.
print(inspect(engine).get_table_names())

['restaurant_data']


In [22]:
# Open a connection from the engine.
# NOTE(review): the cells visible below pass `engine` (not `con`) to to_sql and to the
# SELECT query, and `con` is never closed in the visible cells — confirm whether it is needed.
con = engine.connect()

In [23]:
# Write the cleaned frame to the 'country_data' table, replacing any existing table.
# index=True is the pandas default, spelled out here: the frame's row labels are
# stored as an extra column alongside the data.
countries_df.to_sql(
    name='country_data',
    con=engine,
    if_exists='replace',
    index=True,
)

In [25]:
from sqlalchemy import text

# Read the table back to verify the upload.
# Engine.execute() and raw-string SQL were removed in SQLAlchemy 2.0, so the query is
# wrapped in text() and run on an explicit connection; the `with` block guarantees the
# connection is returned to the pool even if the query fails.
with engine.connect() as verify_connection:
    country_rows = verify_connection.execute(text("SELECT * FROM country_data")).fetchall()
country_rows

[(69, 'France', 60876136, 547030, 111.3, 27600.0, 0.764),
 (98, 'Ireland', 4062235, 70280, 57.8, 29600.0, 0.49),
 (101, 'Italy', 58133509, 301230, 193.0, 26700.0, 0.688),
 (213, 'United Kingdom', 60609153, 244820, 247.6, 27700.0, 0.758)]