## Hello Git
учимся по этому ролику:
https://youtu.be/zZBiln_2FhM

текст: https://vk.com/@vladilen.minin-git-and-github

In [4]:
# it's important note from GitHub: заход по паролю и юзернейму скоро работать не будет.
# https://github.blog/2020-12-15-token-authentication-requirements-for-git-operations/

# Step 1. Let's scrab the dataframe from Wiki

#### source: https://en.wikipedia.org/wiki/Demographics_of_Toronto_neighbourhoods

In [286]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

URL = 'https://en.wikipedia.org/wiki/Demographics_of_Toronto_neighbourhoods'
response = requests.get(URL)
soup = BeautifulSoup(response.text, 'html.parser') 

# there are 5 tables on the wiki's page, so we need to use find_all command
table_all = soup.find_all('table',{'class':'wikitable sortable'})

# we want to scrab the 5th table ("Scarborough")
table = table_all[4].tbody 

rows = table.find_all('tr')
column_names = [v.text.replace('\n','') for v in rows[0].find_all('th')]

df = pd.DataFrame(columns=column_names)
df.head()

Unnamed: 0,Name,FM,Census Tracts,Population,Land area (km2),Density (people/km2),% Change in Population since 2001,Average Income,Transit Commuting %,% Renters,Second most common language (after English) by name,Second most common language (after English) by percentage,Map


In [287]:
# Now let's fill table by data
for i in range(1,len(rows)):
    tds = rows[i].find_all('td')
    if len(tds)==4:
        values = [tds[0].text, tds[1].text, tds[2].text, tds[3].text.replace('\n',''.replace('\xa0',''))]
    else:
        values = [td.text.replace('\n',''.replace('\xa0','')) for td in tds]
    df = df.append(pd.Series(values, index=column_names), ignore_index=True)

df

Unnamed: 0,Name,FM,Census Tracts,Population,Land area (km2),Density (people/km2),% Change in Population since 2001,Average Income,Transit Commuting %,% Renters,Second most common language (after English) by name,Second most common language (after English) by percentage,Map
0,Toronto CMA Average,,All,5113149,5903.63,866,9.0,40704,10.6,11.4,,,
1,Agincourt,S,"0377.01, 0377.02, 0377.03, 0377.04, 0378.02, 0...",44577,12.45,3580,4.6,25750,11.1,5.9,Cantonese (19.3%),19.3% Cantonese,
2,Alexandra Park,OCoT,0039.00,4355,0.32,13609,0.0,19687,13.8,28.0,Cantonese (17.9%),17.9% Cantonese,
3,Allenby,OCoT,0140.00,2513,0.58,4333,-1.0,245592,5.2,3.4,Russian (1.4%),01.4% Russian,
4,Amesbury,NY,"0280.00, 0281.01, 0281.02",17318,3.51,4934,1.1,27546,16.4,19.7,Spanish (6.1%),06.1% Spanish,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
151,Woburn,S,"0356.00, 0357.01, 0357.02, 0363.07, 0364.01, 0...",48507,13.34,3636,-1.5,26190,13.3,16.0,Gujarati (9.1%),09.1% Gujarati,
152,Wychwood,OCoT,0116.00,4182,0.68,6150,-2.0,53613,17.1,20.1,Portuguese (2.7%),02.7% Portuguese,
153,York Mills,NY,"0273.01, 0273.02, 0274.01, 0274.02",17564,7.29,2409,2.0,92099,10.0,11.8,Korean (4.0%),04.0% Korean,
154,York University Heights,NY,"0311.02, 0311.03, 0311.04, 0311.05, 0311.06",26140,13.21,1979,-1.2,24432,15.2,20.4,Italian (6.6%),06.6% Italian,


# 2. Let's improve our dataframe

In [344]:
# Remove four unuseful columns
df_1 = df.drop(labels=["FM", "Census Tracts", "Map", "Second most common language (after English) by percentage"], axis=1)

# Rename some columns to have better look
df_1.rename(columns={
    "Name": "Neighborhood", 
    "% Change in Population since 2001": "Change in Population since 2001 (%)",
    "Average Income": "Average Income (CAD)",
    "% Renters": "Renters (%)",
    "Second most common language (after English) by name": "Second language after English"}, 
    inplace=True)
df_1.head()

Unnamed: 0,Neighborhood,Population,Land area (km2),Density (people/km2),Change in Population since 2001 (%),Average Income (CAD),Transit Commuting %,Renters (%),Second language after English
0,Toronto CMA Average,5113149,5903.63,866,9.0,40704,10.6,11.4,
1,Agincourt,44577,12.45,3580,4.6,25750,11.1,5.9,Cantonese (19.3%)
2,Alexandra Park,4355,0.32,13609,0.0,19687,13.8,28.0,Cantonese (17.9%)
3,Allenby,2513,0.58,4333,-1.0,245592,5.2,3.4,Russian (1.4%)
4,Amesbury,17318,3.51,4934,1.1,27546,16.4,19.7,Spanish (6.1%)


In [345]:
# Does we have some empty rows? Let's check it:
(df_1['Neighborhood'].values == '').sum()

4

#### Yes, we have 4 empty rows. They should be deleted.

In [346]:
# Create the List with indexes of empty rows
Neighborhood = df_1["Neighborhood"]
list_of_empty_rows = []
for i in range(1, df_1.shape[0]):
    if Neighborhood[i] == '':
        list_of_empty_rows.append(i)
list_of_empty_rows

[65, 68, 95, 96]

In [347]:
df_1.drop(labels=list_of_empty_rows, inplace=True)
df_1.drop(labels=0, inplace=True)                    # Also remove first row with average information
df_1.reset_index(drop=True, inplace=True)
df_1.shape

(151, 9)

In [292]:
# Let's chek the data types 
df_1.dtypes

Neighborhood                           object
Population                             object
Land area (km2)                        object
Density (people/km2)                   object
Change in Population since 2001 (%)    object
Average Income (CAD)                   object
Transit Commuting %                    object
Renters (%)                            object
Second language after English          object
dtype: object

#### All datas were parced as an objects. As object type datas can't be processed, we need to fixed it.

In [348]:
# Let's try to use "convert_dtype" command:
df_1 = df_1.convert_dtypes()
df_1.dtypes

Neighborhood                           string
Population                             string
Land area (km2)                        string
Density (people/km2)                   string
Change in Population since 2001 (%)    string
Average Income (CAD)                   string
Transit Commuting %                    string
Renters (%)                            string
Second language after English          string
dtype: object

#### Now we have string type for all cells. Let's transfer strings to float for columns with numerical datas:

In [349]:
# Step_1. Replace all commas to dots:
df_1["Population"] = df_1["Population"].str.replace(',','.')
df_1["Density (people/km2)"] = df_1["Density (people/km2)"].str.replace(',','.')
df_1["Average Income (CAD)"] = df_1["Average Income (CAD)"].str.replace(',','.')

# Step_2. Change all numeric datas to float type:
df_1["Population"] = df_1["Population"].astype(float)
df_1["Land area (km2)"] = df_1["Land area (km2)"].astype(float)
df_1["Density (people/km2)"] = df_1["Density (people/km2)"].astype(float)
df_1["Change in Population since 2001 (%)"] = df_1["Change in Population since 2001 (%)"].astype(float)
df_1["Average Income (CAD)"] = df_1["Average Income (CAD)"].astype(float)
# df_1["Transit Commuting %"] = df_1["Transit Commuting %"].astype(float)  - Very strange column... Let's ignore it
df_1["Renters (%)"] = df_1["Renters (%)"].astype(float)
df_1.dtypes

Neighborhood                            string
Population                             float64
Land area (km2)                        float64
Density (people/km2)                   float64
Change in Population since 2001 (%)    float64
Average Income (CAD)                   float64
Transit Commuting %                     string
Renters (%)                            float64
Second language after English           string
dtype: object

#### The last improvement - edit all Neighborhood names to provide Geolocator correct work:

In [374]:
df_1.shape

(151, 9)

In [365]:
# Step_2. Deleting the Neighborhood names unsupported by Geolocator:
df_2 = df_1.drop(labels=[34, 65, 101, 105, 123])
df_2.reset_index(drop=True, inplace=True)
df_2.shape

(146, 9)

#### Now we have all dataframe ready to be analyzed.

# 3. Let's get geografical coordinates of Neighborhoods

#### получение геокоординат по адресам: https://vk.cc/bYl5a2

In [215]:
# !pip install selenium 
# from selenium import webdriver
# from tqdm import tqdm_notebook as tqdmn
# import time

import folium
!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim        # convert an address into latitude and longitude values

Collecting package metadata (current_repodata.json): done
Solving environment: - 
  - anaconda/osx-64::ca-certificates-2020.10.14-0, anaconda/osx-64::openssl-1.1.1h-haf1e3a3_0
  - anaconda/osx-64::openssl-1.1.1h-haf1e3a3_0, defaults/osx-64::ca-certificates-2020.10.14-0
  - anaconda/osx-64::ca-certificates-2020.10.14-0, defaults/osx-64::openssl-1.1.1h-haf1e3a3_0
  - defaults/osx-64::ca-certificates-2020.10.14-0, defaults/osx-64::openssl-1.1.1h-haf1e3a3done

# All requested packages already installed.



In [367]:
n = df_2.shape[0]                    # number of Neighborhoods
Lat = pd.Series([0.0] * n)           # create empty Series for latitudes
Lon = pd.Series([0.0] * n)           # create empty Series for longitude
Neighborhood = df_2["Neighborhood"]  # list of Neighborhood names

In [372]:
# Get Neighborhood geocoordinares from Geolocator library. It will take a couple of minutes:
geolocator = Nominatim(user_agent="Scarborough_explorer")
for i in range(n):
    address = Neighborhood[i] + ', Toronto, Ontario'
    location = geolocator.geocode(address)
    Lat[i] = location.latitude
    Lon[i] = location.longitude  

In [373]:
df_2 = df_2.assign(Latitude=Lat.values) 
df_2 = df_2.assign(Longitude=Lon.values) 
df_2.head()

Unnamed: 0,Neighborhood,Population,Land area (km2),Density (people/km2),Change in Population since 2001 (%),Average Income (CAD),Transit Commuting %,Renters (%),Second language after English,Latitude,Longitude
0,Agincourt,44.577,12.45,3580.0,4.6,25.75,11.1,5.9,Cantonese (19.3%),43.785353,-79.278549
1,Alexandra Park,4.355,0.32,13.609,0.0,19.687,13.8,28.0,Cantonese (17.9%),43.650787,-79.404318
2,Allenby,2.513,0.58,4333.0,-1.0,245.592,5.2,3.4,Russian (1.4%),43.712849,-79.547065
3,Amesbury,17.318,3.51,4.934,1.1,27.546,16.4,19.7,Spanish (6.1%),43.706162,-79.483492
4,Armour Heights,4.384,2.29,1914.0,2.0,116.651,10.8,16.1,Russian (9.4%),43.743944,-79.430851


## DONE !!

Credentials from 
https://ru.foursquare.com/developers/apps/002XCF5MADOPDEBGLAMJW5GO2AV3C30U1RRF5RAE53V3Y3MW/settings

Client Id
002XCF5MADOPDEBGLAMJW5GO2AV3C30U1RRF5RAE53V3Y3MW

Client Secret
IMRERLDG5HFV1UOH1355WMBJIOTRYCSPV0JUH4XGAL5TOGSN

code=P311LZ2TWU1EQXRH0FRTNYRD1KIFDRQJ0AIIB2KTFXBCMM3K#_=_

https://foursquare.com/oauth2/access_token?client_id=002XCF5MADOPDEBGLAMJW5GO2AV3C30U1RRF5RAE53V3Y3MW&client_secret=IMRERLDG5HFV1UOH1355WMBJIOTRYCSPV0JUH4XGAL5TOGSN&grant_type=authorization_code&redirect_uri=https://www.google.com&code=P311LZ2TWU1EQXRH0FRTNYRD1KIFDRQJ0AIIB2KTFXBCMM3K#.

{"access_token":"LNHEI3W3F4EKYAHKLBR03QNLGFR1RAEJPBZHHKH5XVOU2JDP"}

### Here is a copy of single-loop of "venues/search" Foursqueare API request - LIMITED TO 50 MAX

In [None]:
# getting search center geocoordinates
address = 'Toronto, Ontario'
geolocator = Nominatim(user_agent="foursquare_agent")
location = geolocator.geocode(address)
lat_Toronto = location.latitude
lng_Toronto = location.longitude

# formalising API credentials
CLIENT_ID = '002XCF5MADOPDEBGLAMJW5GO2AV3C30U1RRF5RAE53V3Y3MW'
CLIENT_SECRET = 'IMRERLDG5HFV1UOH1355WMBJIOTRYCSPV0JUH4XGAL5TOGSN'
ACCESS_TOKEN = 'LNHEI3W3F4EKYAHKLBR03QNLGFR1RAEJPBZHHKH5XVOU2JDP'
VERSION = '20180604'
LIMIT = 50
search_query = ['Russian Restaurant']
radius = 25000
url = 'https://api.foursquare.com/v2/venues/search?client_id={}&client_secret={}&ll={},{}&oauth_token={}&v={}&query={}&radius={}&limit={}'.format(
    CLIENT_ID, CLIENT_SECRET, lat_Toronto, lng_Toronto, ACCESS_TOKEN, VERSION, search_query, radius, LIMIT)

# making request
results = requests.get(url).json()

# assign relevant part of JSON to venues
venues = results['response']['venues']

# tranform venues into a dataframe
dataframe = pd.json_normalize(venues)

# keep only columns that include venue name, address and location
filtered_columns = ['id', 'name', 'categories', 'location.lat', 'location.lng', 'location.address']
df_venues = dataframe.loc[:, filtered_columns]

# function extracts the category of the venue from the 'caregories' column
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

# filter the category for each row
df_venues['categories'] = df_venues.apply(get_category_type, axis=1)

# Save Toronto venues result dataframe into csv file
df_venues.to_csv(r'/Users/borisyushenkov/Desktop/DS_ML_NN/IBM/Filename.csv')

# Сalculating the pi number using the Leibniz algorithm

### Pi = 4 - 4/3 + 4/5 - 4/7 + 4/9 - ...

1) What is interesting about this algorithm is that the number of zeros after one in the number n (where n is a number of iterations) determines the number of correctly counted significant digits in the number Pi. That is, entered n = 1,000,000 (six zeros) - we got the accuracy 3.14159ХХХХХХХХ (six significant numbers)

2) The calculation speed is approximately as follows: 10,000 iterations in 1 second (for a MacBookAir 2011 machine with a processor load of about 80%)

In [2]:
import time

n = int(input("Enter the iterations number... "))

Pi = 4
s = 1
if n == 0:                      # n should be positive integer
    print(Pi)
else:
    for i in range(n):
        m = (i+2)*2 - 1
        s = s * (-1)
        Pi = Pi + s*4/m
        print((i+1), end="\r")  # the end = "\ r" construct moves coursore to the beginning of the same line
time.sleep(2)                   # a timer is needed to see the number of iterations for small n
print(Pi)

Enter the iterations number...  100000


3.1416026534897203


In [None]:
# Введите количество итераций  1000000
# 3.1415936535887745