# Wine project pre-processing

In [1]:
import pandas as pd
import requests
import json
import numpy as np
import datetime
import time
import tweepy
import re

from datetime import datetime, timedelta

In [4]:
# Load the wine reviews; the first CSV column becomes the index.
# Raw string: the Windows-style path contains backslashes ("\D", "\w") that
# Python would otherwise report as invalid escape sequences.
df = pd.read_csv(r'..\Data\winemag-data-130k-v2.csv', index_col=0)

## Extract information from title

In [None]:
# Flat Python list holding one entry per row of the `title` column.
titles = df['title'].tolist()
titles

In [None]:
# The pattern splits a title into four groups:
#   "<producer> <digits> <vineyard/grape> (<region>)"
title_pattern = re.compile(r'(\w.+) (\d*) ([^(]+) (\(.*\))')

producer, years, vineyard_grape, region = [], [], [], []

for title in titles:
    result = title_pattern.match(title)
    if result is None:
        # Titles that do not fit the pattern keep their row; every extracted
        # field is filled with the placeholder 0.
        parts = (0, 0, 0, 0)
    else:
        name, year, grape, area = result.groups()
        # area still carries its surrounding parentheses; strip them.
        parts = (name, year, grape, area[1:-1])

    for target, part in zip((producer, years, vineyard_grape, region), parts):
        target.append(part)

# Attach the extracted fields as new columns, in the same order as before.
for column_name, column_values in (
    ('producer', producer),
    ('year', years),
    ('vineyard_grape', vineyard_grape),
    ('region_from_name', region),
):
    df[column_name] = column_values

In [None]:
# Display the frame to inspect the columns extracted from the titles.
df

## Get the number of Twitter followers of each taster

In [None]:
# Display the column holding the tasters' Twitter handles.
df['taster_twitter_handle']

In [None]:
import os

# Twitter API credentials.
# The TWITTER_BEARER_TOKEN environment variable is preferred so the secret does
# not have to be saved inside the notebook; if it is not set, replace the
# placeholder below with your bearer token.
bearer_token = os.environ.get("TWITTER_BEARER_TOKEN", "<insert bearer token from twitter API>")

In [None]:
def getFollowerCount(handle):
    """Return the follower count of a single Twitter account.

    A new ``tweepy.Client`` is created from the module-level ``bearer_token``
    on every call and asked for the public metrics of ``handle``.

    Args:
        handle: Twitter username to look up.

    Returns:
        The ``followers_count`` of the first user in the response, or ``0``
        (after printing a "Not found" message) when the response has no data.
    """
    client = tweepy.Client(bearer_token)
    response = client.get_users(usernames=[handle], user_fields=["public_metrics"])

    users = response.data
    if not users:
        print('Not found: ' + handle)
        return 0

    # Only one username was requested, so the first entry is the one we want.
    first_user = next(iter(users))
    return first_user.public_metrics['followers_count']

In [None]:
# One single-element list per row: [[handle], [handle], ...]
handle_column = df[['taster_twitter_handle']]
handles = handle_column.to_numpy().tolist()
handles

In [None]:
# Follower counts already fetched, keyed by handle without the leading "@",
# so every distinct taster costs only one API request.
# (Named follower_cache rather than `dict` to avoid shadowing the builtin.)
follower_cache = {}
followers = []

handle_pattern = re.compile(r'@(\w+)')

for handle in handles:
    # Each entry of `handles` is a single-element list.
    value = handle[0]
    print(value)

    # Non-string values (e.g. NaN for a missing handle) cannot be matched.
    result = handle_pattern.match(value) if isinstance(value, str) else None

    if result is None:
        # Exactly one value must be appended per row; otherwise `followers`
        # ends up shorter than the frame and the column assignment fails.
        print('error during parsing')
        followers.append(0)
        continue

    correct_handle = result.group(1)
    if correct_handle not in follower_cache:
        follower_cache[correct_handle] = getFollowerCount(correct_handle)
    followers.append(follower_cache[correct_handle])

df['followers'] = followers

In [None]:
# Display the frame to inspect the new `followers` column.
df

In [None]:
# Save the enriched frame as a semicolon-separated file.
# Raw string: the Windows-style path contains backslashes ("\D", "\w") that
# Python would otherwise report as invalid escape sequences.
df.to_csv(r'..\Data\winemag-data-130k-v2+extract.csv', sep=';')

## Get weather information

Die klimatischen Minimalanforderungen für den Weinbau liegen bei einer mittleren Jahrestemperatur von mindestens 9 Grad Celsius, einer Durchschnittstemperatur des wärmsten Monats von mindestens 18 Grad Celsius, einer maximalen Tiefsttemperatur im Winter von minus 13 Grad Celsius, mindestens 1.300 Sonnenstunden pro Jahr, einer jährlichen Niederschlagsmenge von mindestens 500 mm (in warmem Klima mindestens 750 mm) sowie einer Vegetationsperiode (der Zeit zwischen dem letzten und dem ersten Frost) von mindestens 180 Tagen. Die optimale Temperatur für das Traubenwachstum beträgt zwischen 25 und 28 Grad Celsius. Die Temperatur hängt maßgeblich davon ab, in welcher Höhe die Reben stehen; als Faustregel gilt, dass die Temperatur pro 100 Meter Höhenunterschied aufwärts um 0,6 Grad sinkt.

In [None]:
import os

# OpenWeather API key.
# The OPENWEATHER_API_KEY environment variable is preferred so the key does not
# have to be saved inside the notebook; if it is not set, replace the
# placeholder below with your key.
apikey = os.environ.get('OPENWEATHER_API_KEY', '<insert openweather api key>')

In [None]:
# max. 50.000 request / day on historic API, so only the first rows are kept.
MAX_ROWS = 50000

# Keep only the location columns. .copy() yields an independent frame, so that
# adding columns to it later does not trigger pandas' SettingWithCopyWarning.
df = df[['country', 'province', 'region_1']].iloc[:MAX_ROWS].copy()

df.info()

In [None]:
def getGeo(url):
    """Fetch the coordinates of the first geocoding match for ``url``.

    Args:
        url: Complete request URL, including the query and the API key.

    Returns:
        A ``(lat, lon)`` tuple taken from the first entry of the JSON
        response, or ``(0, 0)`` when the request fails, the body is not valid
        JSON, the status code is not 200, or the response holds no entry.
    """
    try:
        # Without a timeout a stalled connection would block the run forever.
        response = requests.get(url, timeout=30)
        payload = response.json()
    except (requests.RequestException, ValueError) as error:
        # Print only the error type: the exception text may contain the full
        # URL, which includes the API key.
        print('request failed: ' + type(error).__name__)
        return 0, 0

    if response.status_code != 200:
        print(payload)
        return 0, 0

    if not isinstance(payload, list):
        # A successful answer is expected to be a list of matches.
        print(payload)
        return 0, 0

    if not payload:
        return 0, 0

    first_match = payload[0]
    return first_match['lat'], first_match['lon']

In [None]:
def getTemp(url):
    """Fetch median temperature, humidity and wind from a weather request.

    Args:
        url: Complete request URL, including the coordinates and the API key.

    Returns:
        A ``(temp, humidity, wind)`` tuple built from the ``median`` values
        under ``result`` in the JSON response, with 273.15 subtracted from the
        temperature. ``(0, 0, 0)`` is returned when the request fails, the
        body is not valid JSON, the status code is not 200, or the response
        is empty.
    """
    try:
        # Without a timeout a stalled connection would block the run forever.
        response = requests.get(url, timeout=30)
        payload = response.json()
    except (requests.RequestException, ValueError) as error:
        # Print only the error type: the exception text may contain the full
        # URL, which includes the API key.
        print('request failed: ' + type(error).__name__)
        return 0, 0, 0

    if response.status_code != 200:
        print(payload)
        return 0, 0, 0

    if len(payload) == 0:
        return 0, 0, 0

    result = payload['result']
    # NOTE(review): the 273.15 offset assumes the API reports Kelvin — confirm,
    # since the request URL built elsewhere in this notebook passes units=metric.
    temp = result['temp']['median'] - 273.15
    humidity = result['humidity']['median']
    wind = result['wind']['median']
    return temp, humidity, wind

In [None]:
from urllib.parse import quote

# Coordinates already looked up, keyed by request URL, so every distinct
# location costs only one API request.
# (Named geo_cache rather than `dict` to avoid shadowing the builtin.)
geo_cache = {}
lats = []
lons = []

for index, row in df.iterrows():
    country = row['country']
    province = row['province']
    region = row['region_1']

    # Keep only the parts that are present and percent-encode each of them, so
    # names containing "&", "#" or spaces cannot break the query string.
    # Joining with "," leaves no trailing comma, so nothing has to be trimmed
    # afterwards (trimming used to cut off the last letter of the name).
    # NOTE(review): parts are sent as country,province,region — confirm this is
    # the order the geocoding endpoint expects.
    parts = [
        quote(str(part), safe='')
        for part in (country, province, region)
        if not pd.isna(part)
    ]

    url = "http://pro.openweathermap.org/geo/1.0/direct?q=" + ",".join(parts)
    url += "&limit=1&appid=" + apikey

    if url in geo_cache:
        lat, lon = geo_cache[url]
    else:
        lat, lon = getGeo(url)
        geo_cache[url] = (lat, lon)

    lats.append(lat)
    lons.append(lon)

df['lat'] = lats
df['lon'] = lons

In [None]:
# Save the frame with coordinates as a semicolon-separated file.
# Raw string: the Windows-style path contains backslashes ("\D", "\w") that
# Python would otherwise report as invalid escape sequences.
df.to_csv(r'..\Data\winemag-data-130k-v2+geo.csv', sep=';')

In [None]:
# Weather values already fetched, keyed by request URL, so every distinct
# (rounded) coordinate pair costs only one API request.
# (Named weather_cache rather than `dict` to avoid shadowing the builtin.)
weather_cache = {}
temps = []
humidities = []
winds = []

for index, row in df.iterrows():
    lat = row['lat']
    lon = row['lon']

    # (0, 0) marks rows for which getGeo found no coordinates; skip the request
    # but still append one value per list so the columns stay aligned.
    if lat == 0 and lon == 0:
        temps.append(0)
        humidities.append(0)
        winds.append(0)
        continue

    # The request always asks for the aggregated values of 1 July
    # (month=7&day=1); no per-row date is needed.
    # Original note: with our subscription we can only get data from last year.
    url = (
        "https://history.openweathermap.org/data/2.5/aggregated/day?lat="
        + str(round(lat, 2))
        + "&lon="
        + str(round(lon, 2))
        + "&month=7&day=1&units=metric&appid="
        + apikey
    )

    if url in weather_cache:
        temp, humidity, wind = weather_cache[url]
    else:
        temp, humidity, wind = getTemp(url)
        weather_cache[url] = (temp, humidity, wind)

    temps.append(temp)
    humidities.append(humidity)
    winds.append(wind)

df['temp'] = temps
df['humidity'] = humidities
df['wind'] = winds

df

In [None]:
# Save the frame with weather values as a semicolon-separated file.
# Raw string: the Windows-style path contains backslashes ("\D", "\w") that
# Python would otherwise report as invalid escape sequences.
df.to_csv(r'..\Data\winemag-data-130k-v2+temp.csv', sep=';')

In [None]:
#df = df = pd.read_csv('..\Data\winemag-data-130k-v2+geo.csv', index_col=0, delimiter=';')
#df