## Setup

In [1]:
import logging
import re
from urllib.parse import quote

import boto3
import botocore
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

### Logging

In [2]:
# Root logger: timestamped records at DEBUG level and above.
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
)

# Restrict these library loggers to ERROR and above so their DEBUG/INFO
# records are not emitted through the DEBUG-level root configuration.
for quiet_name in ("botocore", "s3transfer", "urllib3", "boto3",
                   "asyncio", "aiobotocore", "s3fs"):
    logging.getLogger(quiet_name).setLevel(logging.ERROR)

# Logger used by the S3 helper functions in this notebook.
logger = logging.getLogger('s3')

### S3

In [3]:
# from: https://github.com/MSIA/2021-msia423/blob/main/aws-s3/s3.py

def parse_s3(s3path):
    """Split an S3 URI of the form ``s3://<bucket>/<key>`` into its parts.

    Args:
        s3path (str): S3 URI. The bucket may contain word characters, ``.``,
            ``_`` and ``-``; the key may additionally contain ``/``.

    Returns:
        tuple: ``(s3bucket, s3path)`` where ``s3bucket`` is the bucket name and
            ``s3path`` is the object key within that bucket.

    Raises:
        ValueError: If ``s3path`` is not entirely of the form
            ``s3://<bucket>/<key>`` with the supported characters.
    """
    regex = r"s3://([\w._-]+)/([\w./_-]+)"

    # fullmatch instead of match: a key containing an unsupported character
    # (e.g. a space) is rejected rather than silently truncated at that point.
    m = re.fullmatch(regex, s3path)
    if m is None:
        raise ValueError(
            f"Invalid S3 path {s3path!r}; expected the form 's3://<bucket>/<key>'."
        )

    s3bucket = m.group(1)
    s3path = m.group(2)

    return s3bucket, s3path

def upload_to_s3_pandas(local_path, s3path, sep=';'):
    """Read a delimited file with pandas and write it to ``s3path``.

    Args:
        local_path (str): Path of the source file, read with ``pd.read_csv``.
        s3path (str): Destination handed to ``DataFrame.to_csv``
            (e.g. an ``s3://`` URI).
        sep (str): Field delimiter used for both reading and writing.

    Returns:
        None. A missing-credentials failure is logged, not raised.
    """
    df = pd.read_csv(local_path, sep=sep)

    try:
        # index=False: do not write the pandas row index, which would add an
        # extra unnamed leading column that the source file does not have.
        df.to_csv(s3path, sep=sep, index=False)
    except botocore.exceptions.NoCredentialsError:
        logger.error('Please provide AWS credentials via AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY env variables.')
    else:
        logger.info('Data uploaded from %s to %s', local_path, s3path)

def download_from_s3_pandas(local_path, s3path, sep=';'):
    """Read a delimited file from ``s3path`` with pandas and write it locally.

    Args:
        local_path (str): Destination path handed to ``DataFrame.to_csv``.
        s3path (str): Source handed to ``pd.read_csv`` (e.g. an ``s3://`` URI).
        sep (str): Field delimiter used for both reading and writing.

    Returns:
        None. A missing-credentials failure is logged, not raised.
    """
    try:
        df = pd.read_csv(s3path, sep=sep)
    except botocore.exceptions.NoCredentialsError:
        logger.error('Please provide AWS credentials via AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY env variables.')
    else:
        # index=False: do not write the pandas row index, which would add an
        # extra unnamed leading column that the source file does not have.
        df.to_csv(local_path, sep=sep, index=False)
        # Log the actual direction of the transfer: S3 source -> local destination.
        logger.info('Data downloaded from %s to %s', s3path, local_path)
        

In [4]:
# Data-source switch: True reads the dataset from the local file path,
# False reads it from the S3 bucket (see the "Load Data" cell).
local = True

## Load Data

In [5]:
# Pick the dataset location according to the `local` switch, then read it once.
data_path = (
    '~/Desktop/Spring2021/avc/Project_External_Prepare/pokemon.csv'
    if local
    else 's3://2021-msia423-wenyang-pan/raw/pokemon.csv'
)
df = pd.read_csv(data_path)

## Explore Data

In [6]:
# Preview the first two rows of the loaded dataset.
df.head(2)

Unnamed: 0,abilities,against_bug,against_dark,against_dragon,against_electric,against_fairy,against_fight,against_fire,against_flying,against_ghost,...,percentage_male,pokedex_number,sp_attack,sp_defense,speed,type1,type2,weight_kg,generation,is_legendary
0,"['Overgrow', 'Chlorophyll']",1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,...,88.1,1,65,65,45,grass,poison,6.9,1,0
1,"['Overgrow', 'Chlorophyll']",1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,...,88.1,2,80,80,60,grass,poison,13.0,1,0


In [7]:
# List every column name available in the dataset.
df.columns

Index(['abilities', 'against_bug', 'against_dark', 'against_dragon',
       'against_electric', 'against_fairy', 'against_fight', 'against_fire',
       'against_flying', 'against_ghost', 'against_grass', 'against_ground',
       'against_ice', 'against_normal', 'against_poison', 'against_psychic',
       'against_rock', 'against_steel', 'against_water', 'attack',
       'base_egg_steps', 'base_happiness', 'base_total', 'capture_rate',
       'classfication', 'defense', 'experience_growth', 'height_m', 'hp',
       'japanese_name', 'name', 'percentage_male', 'pokedex_number',
       'sp_attack', 'sp_defense', 'speed', 'type1', 'type2', 'weight_kg',
       'generation', 'is_legendary'],
      dtype='object')

## Clean Data, Cluster, and Recommend

In [8]:
# Columns used as the clustering features.
# NOTE(review): 'speed' is present in the dataset but not selected here — confirm this is intentional.
select_columns = ['hp', 'attack', 'defense', 'sp_attack', 'sp_defense']

In [9]:
# Feature matrix: all rows, restricted to the selected columns.
df_input = df.loc[:, select_columns]

In [10]:
# Standardize the features with StandardScaler, then wrap the resulting array
# back into a DataFrame carrying the original column names.
std_scale = StandardScaler()
scaled_values = std_scale.fit_transform(df_input)
df_scale = pd.DataFrame(scaled_values, columns=df_input.columns)

In [11]:
# Fit an 8-cluster k-means model on the standardized features.
# random_state is pinned so the cluster assignments (and the results derived
# from them) are reproducible when the notebook is re-run.
mod_kmeans = KMeans(n_clusters=8, random_state=42)
# Assign to `_` so the fitted estimator's repr is not echoed as cell output.
_ = mod_kmeans.fit(df_scale)

In [12]:
# Attach each row's cluster label to the original frame.
# Note: this adds a column to `df` in place.
df['labels'] = mod_kmeans.labels_

In [13]:
# Name to look up in the `name` column; its cluster drives the results below.
user_input = 'Bulbasaur'

In [14]:
# Look up the cluster label of the requested name.
label_matches = df.query("name==@user_input")['labels']
if label_matches.empty:
    # Fail with a clear message instead of the cryptic size error that
    # .item() raises on an empty selection.
    raise ValueError(f"No entry named {user_input!r} found in the dataset.")
# .item() requires exactly one match, so duplicate names still raise ValueError.
target_label = label_matches.item()

2021-04-27 09:26:17,889 numexpr.utils INFO     Note: NumExpr detected 12 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2021-04-27 09:26:17,890 numexpr.utils INFO     NumExpr defaulting to 8 threads.


In [15]:
# Rows sharing the target cluster, excluding the queried name itself.
in_target_cluster = df['labels'] == target_label
is_other_name = df['name'] != user_input
df_result = (
    df.loc[in_target_cluster & is_other_name, ['name', 'type1']]
    .reset_index(drop=True)
)
df_result

Unnamed: 0,name,type1
0,Charmander,fire
1,Caterpie,bug
2,Metapod,bug
3,Weedle,bug
4,Kakuna,bug
...,...,...
155,Salandit,poison
156,Bounsweet,grass
157,Steenee,grass
158,Wimpod,bug


In [16]:
def get_url(name):
    """Build the pokemondb.net Pokedex URL for ``name``.

    The name is percent-encoded so that spaces and punctuation
    (e.g. ``"Mr. Mime"``) yield a well-formed URL; names made only of
    letters, digits, ``_``, ``.``, ``-`` and ``~`` are left unchanged.

    Args:
        name: Name to embed in the URL path; converted with ``str()``.

    Returns:
        str: ``https://pokemondb.net/pokedex/<encoded name>``.
    """
    # safe='' so that '/' inside a name is encoded too and cannot add a path segment.
    # NOTE(review): the site may expect a different slug format for such names
    # (e.g. lowercase, hyphenated) — confirm against pokemondb.net.
    encoded_name = quote(str(name), safe='')
    return f'https://pokemondb.net/pokedex/{encoded_name}'

In [17]:
# Build one URL per name and store it in a new `link` column.
df_result['link'] = df_result['name'].map(get_url)

In [18]:
# Print the first five result rows as a markdown table (plain text, ready to paste).
print(df_result.head(5).to_markdown())

|    | name       | type1   | link                                     |
|---:|:-----------|:--------|:-----------------------------------------|
|  0 | Charmander | fire    | https://pokemondb.net/pokedex/Charmander |
|  1 | Caterpie   | bug     | https://pokemondb.net/pokedex/Caterpie   |
|  2 | Metapod    | bug     | https://pokemondb.net/pokedex/Metapod    |
|  3 | Weedle     | bug     | https://pokemondb.net/pokedex/Weedle     |
|  4 | Kakuna     | bug     | https://pokemondb.net/pokedex/Kakuna     |
