# Lede Summer 2019 Project - Part 9
### Transform dataframe to geojson! Clean up columns first, before transforming into geojson format

In [3]:
import requests
import pandas as pd
import numpy as np
import os
import json
import ast

pd.set_option('display.max_rows', None)

In [4]:
df = pd.read_csv('all_info_cleaned.csv')

## Narrow down df by dropping extraneous columns

In [3]:
df = df.drop(columns = ['first_name', 'last_name', 'code', 'NOC'])

## Clean the columns by replacing NaNs with empty strings'

* df['alternate_name'] 
* df['other_info'] 

In [4]:
df['alternate_name'] = df.alternate_name.fillna('')
df['other_info'] = df.other_info.fillna('')
df.head(3)

Unnamed: 0,alternate_name,citizenship,event,full_name,game_type,gender,medals_bronze,medals_gold,medals_silver,medals_total,other_info,season,years,country_name,latitude,longitude,article_results,hits
0,,SWE,Para shooting,Jonas JAKOBSSON,Paralympic,Men,8,17,2,27,,Summer,1980-2012,Sweden,60.128161,18.643501,0,0
1,,ITA,Wheelchair fencing,Roberto MARSON,Paralympic,Men,3,16,7,26,,Summer,1964-1976,Italy,41.87194,12.56738,0,0
2,,GBR,Para swimming,Mike KENNY,Paralympic,Men,0,16,0,16,,Summer,1976-1988,United Kingdom,55.378051,-3.435973,"[{'headline': ""CADETS' DEDMOND TIES DASH MARK""...",1


## Write a function to turn long/lat columns into [latitude, longitude] format. Save it into a new column called geometry.coordinates


In [5]:
df.shape

(145, 18)

In [6]:
def get_coord(lat, long):
    coord = []
    coord.append(lat)
    coord.append(long)
    return coord

In [7]:
## Try out the function to see if it works
lat = df.latitude[0]
long = df.longitude[0]
get_coord(lat, long)

[60.128161, 18.643501]

In [8]:
lat_list = df.latitude.to_list()
long_list = df.longitude.to_list()
len(long_list)
length = len(lat_list)

In [9]:
coord_list = []
for idx in list(range(length)):
    lat = lat_list[idx]
    long = long_list[idx]
    coord = get_coord(lat, long)
    coord_list.append(coord)
len(coord_list)

145

In [10]:
df['geometry.coordinates'] = coord_list

### Clean up the df by removing longitude and latitude columns

In [11]:
df = df.drop(columns = ['latitude', 'longitude'])

In [12]:
df.shape

(145, 17)

## Transform df to geojson

I need to combine the information in different columns into a single column that can be transformed into 'article' for our geojson doc.  This is how we build the **"article:"** field of our geojson doc. You need to combine columns of data into readable text.

In [13]:
story = df.article_results[44]
story

"[{'headline': 'Ones to Know: Brian McKeever', 'lede': 'A look at some notable athletes who will take the stage at the Vancouver Games.', 'url': 'https://vancouver2010.blogs.nytimes.com/2010/02/13/ones-to-know-brian-mckeever/'}, {'headline': 'Losing His Sight, but Not His Dream of Making Olympics', 'lede': 'OTTAWA \\x97 When Brian McKeever made his debut with the Canadian team at this year’s world Nordic skiing championships in Sapporo, Japan, he was no stranger to high-level competition.', 'url': 'https://www.nytimes.com/2007/03/20/sports/othersports/20ski.html'}]"

## df.article_results is in STRING format! Turn it into actual dictionaries

In [14]:
# This gives you 'story' in list form

import ast
ast.literal_eval(story)

[{'headline': 'Ones to Know: Brian McKeever',
  'lede': 'A look at some notable athletes who will take the stage at the Vancouver Games.',
  'url': 'https://vancouver2010.blogs.nytimes.com/2010/02/13/ones-to-know-brian-mckeever/'},
 {'headline': 'Losing His Sight, but Not His Dream of Making Olympics',
  'lede': 'OTTAWA \x97 When Brian McKeever made his debut with the Canadian team at this year’s world Nordic skiing championships in Sapporo, Japan, he was no stranger to high-level competition.',
  'url': 'https://www.nytimes.com/2007/03/20/sports/othersports/20ski.html'}]

In [15]:
# Turn everything in df.article_results into dictionaries

results_dict = []

for result in df.article_results:
    result_dict = ast.literal_eval(result)
    results_dict.append(result_dict)
    
results_dict[44]

[{'headline': 'Ones to Know: Brian McKeever',
  'lede': 'A look at some notable athletes who will take the stage at the Vancouver Games.',
  'url': 'https://vancouver2010.blogs.nytimes.com/2010/02/13/ones-to-know-brian-mckeever/'},
 {'headline': 'Losing His Sight, but Not His Dream of Making Olympics',
  'lede': 'OTTAWA \x97 When Brian McKeever made his debut with the Canadian team at this year’s world Nordic skiing championships in Sapporo, Japan, he was no stranger to high-level competition.',
  'url': 'https://www.nytimes.com/2007/03/20/sports/othersports/20ski.html'}]

In [16]:
len(results_dict)

145

In [17]:
len(results_dict[44])

2

### Write a function, get_article_info(), which takes the dictionary of article searches and returns the headline and url for all articles written about the athlete in html form.
Every athlete has a list, and each list consists of a list of articles

In [18]:
def get_article_info(result_dict):
    num_articles = len(result_dict)
    outputs = []
    for idx in list(range(num_articles)):
        title = result_dict[idx]['headline']
        url = result_dict[idx]['url']
        output = f'<div id="story"><p><a href="{url}">{title}</a></p></div>'
        outputs.append(output)
    return outputs

In [19]:
# Try the function on one athlete

get_article_info(results_dict[44])

['<div id="story"><p><a href="https://vancouver2010.blogs.nytimes.com/2010/02/13/ones-to-know-brian-mckeever/">Ones to Know: Brian McKeever</a></p></div>',
 '<div id="story"><p><a href="https://www.nytimes.com/2007/03/20/sports/othersports/20ski.html">Losing His Sight, but Not His Dream of Making Olympics</a></p></div>']

In [20]:
article_html_list = []
for result in results_dict:
    if result == 0:
        article_html_list.append('')
    else:
        article = get_article_info(result)
        article_html_list.append(article)
    
article_html_list

['',
 '',
 ['<div id="story"><p><a href="https://www.nytimes.com/1970/01/18/archives/cadets-dedmond-ties-dash-mark-equals-meet-record-of-63army-wins-7.html">CADETS\' DEDMOND TIES DASH MARK</a></p></div>'],
 '',
 ['<div id="story"><p><a href="https://beijing2008.blogs.nytimes.com/2008/09/13/day-7-second-gold-for-pistorius-iran-forfeits-before-potential-game-vs-israel/">Day 7: Second Gold for Pistorius; Iran Forfeits Before Potential Game vs. Israel</a></p></div>',
  '<div id="story"><p><a href="https://www.nytimes.com/2008/05/15/sports/othersports/15george.html">A Blur of Hands, Spokes and Determination</a></p></div>',
  '<div id="story"><p><a href="https://olympics.blogs.nytimes.com/2008/09/13/blogs/13olympics-day-7-se613.html">Pistorius’s 2nd Gold on Paralympics Day 7</a></p></div>'],
 '',
 '',
 ['<div id="story"><p><a href="https://olympics.blogs.nytimes.com/2008/09/14/blogs/14olympics-a-fifth-251.html">A Fifth Gold for Du Toit and a Fourth for Petitclerc</a></p></div>',
  '<div id="

## Turn other columns into html so that I can later combine them into one column as properties.article

* full_name (NOTE: this will be in properties.article AND in properties.headline)
* alternate_name
* medals_total (NOTE: this will be in properties.article AND in properties.headline)
* country_name
* game_type
* season
* event
* years
* medals_gold, medals_silver, medals_bronze
* other_info
* hits

In [21]:
df.head(1)

Unnamed: 0,alternate_name,citizenship,event,full_name,game_type,gender,medals_bronze,medals_gold,medals_silver,medals_total,other_info,season,years,country_name,article_results,hits,geometry.coordinates
0,,SWE,Para shooting,Jonas JAKOBSSON,Paralympic,Men,8,17,2,27,,Summer,1980-2012,Sweden,0,0,"[60.128161, 18.643501]"


In [22]:
html = []
for index, row in df.iterrows():
    full_name = '<h1>' + row.full_name + '</h1>'
    if row.alternate_name == '':
        alt_name = ''
    else:
        alt_name = f'<p> Also known as: {row.alternate_name}</p>'
    country = f'<p> Represented: {row.country_name}</p>'
    game_type = f'<p> Competed in the {row.season} {row.game_type}s from {row.years}</p>'
    event = f'<p> Event: {row.event}</p>'
    medals_total = f'<p>Total medals won: {row.medals_total}</p>'
    medals = f'<p> Gold: {row.medals_gold}, Silver: {row.medals_silver}, Bronze: {row.medals_bronze}</p>'
    if row.other_info != '':
        other_info = f'<p>Note: {row.other_info}</p>'
    else:
        other_info = ''
    hits = f'<p>Number of articles in The New York Times: {row.hits}</p>'
    articles = 'Top search results:'
    mystring = full_name + alt_name + country + game_type + \
               event + medals_total + medals + other_info + hits + articles
    html.append(mystring)
# html

In [23]:
df.head(3)

Unnamed: 0,alternate_name,citizenship,event,full_name,game_type,gender,medals_bronze,medals_gold,medals_silver,medals_total,other_info,season,years,country_name,article_results,hits,geometry.coordinates
0,,SWE,Para shooting,Jonas JAKOBSSON,Paralympic,Men,8,17,2,27,,Summer,1980-2012,Sweden,0,0,"[60.128161, 18.643501]"
1,,ITA,Wheelchair fencing,Roberto MARSON,Paralympic,Men,3,16,7,26,,Summer,1964-1976,Italy,0,0,"[41.87194, 12.56738]"
2,,GBR,Para swimming,Mike KENNY,Paralympic,Men,0,16,0,16,,Summer,1976-1988,United Kingdom,"[{'headline': ""CADETS' DEDMOND TIES DASH MARK""...",1,"[55.378051, -3.435973]"


In [24]:
html

['<h1>Jonas JAKOBSSON</h1><p> Represented: Sweden</p><p> Competed in the Summer Paralympics from 1980-2012</p><p> Event: Para shooting</p><p>Total medals won: 27</p><p> Gold: 17, Silver: 2, Bronze: 8</p><p>Number of articles in The New York Times: 0</p>Top search results:',
 '<h1>Roberto MARSON</h1><p> Represented: Italy</p><p> Competed in the Summer Paralympics from 1964-1976</p><p> Event: Wheelchair fencing</p><p>Total medals won: 26</p><p> Gold: 16, Silver: 7, Bronze: 3</p><p>Number of articles in The New York Times: 0</p>Top search results:',
 '<h1>Mike KENNY</h1><p> Represented: United Kingdom</p><p> Competed in the Summer Paralympics from 1976-1988</p><p> Event: Para swimming</p><p>Total medals won: 16</p><p> Gold: 16, Silver: 0, Bronze: 0</p><p>Number of articles in The New York Times: 1</p>Top search results:',
 '<h1>Daniel DIAS</h1><p> Represented: Brazil</p><p> Competed in the Summer Paralympics from 2008-2016</p><p> Event: Para swimming</p><p>Total medals won: 24</p><p> Gold

In [25]:
properties_article = []
for idx, article_html in enumerate(article_html_list):
    html_complete = html[idx]
    if article_html == '':
        pass
    else:
        article_count = len(article_html)
        for idx in list(range(article_count)):
            html_complete += (article_html[idx])
    properties_article.append(html_complete)

# properties_article

In [26]:
df['properties.article'] = properties_article

## Add other columns that I need to turn the dataframe into geojson format

* geometry.type
* properties.name
* properties.headline
* properties.color
* properties.radius
* properties.group_id
* properties.group_name

In [27]:
df['geometry.type'] = 'Point'
df.head(2)

Unnamed: 0,alternate_name,citizenship,event,full_name,game_type,gender,medals_bronze,medals_gold,medals_silver,medals_total,other_info,season,years,country_name,article_results,hits,geometry.coordinates,properties.article,geometry.type
0,,SWE,Para shooting,Jonas JAKOBSSON,Paralympic,Men,8,17,2,27,,Summer,1980-2012,Sweden,0,0,"[60.128161, 18.643501]",<h1>Jonas JAKOBSSON</h1><p> Represented: Swede...,Point
1,,ITA,Wheelchair fencing,Roberto MARSON,Paralympic,Men,3,16,7,26,,Summer,1964-1976,Italy,0,0,"[41.87194, 12.56738]",<h1>Roberto MARSON</h1><p> Represented: Italy<...,Point


In [28]:
df['properties.name'] = df.country_name
df.head(2)

Unnamed: 0,alternate_name,citizenship,event,full_name,game_type,gender,medals_bronze,medals_gold,medals_silver,medals_total,other_info,season,years,country_name,article_results,hits,geometry.coordinates,properties.article,geometry.type,properties.name
0,,SWE,Para shooting,Jonas JAKOBSSON,Paralympic,Men,8,17,2,27,,Summer,1980-2012,Sweden,0,0,"[60.128161, 18.643501]",<h1>Jonas JAKOBSSON</h1><p> Represented: Swede...,Point,Sweden
1,,ITA,Wheelchair fencing,Roberto MARSON,Paralympic,Men,3,16,7,26,,Summer,1964-1976,Italy,0,0,"[41.87194, 12.56738]",<h1>Roberto MARSON</h1><p> Represented: Italy<...,Point,Italy


In [29]:
df['properties.headline'] = df.full_name
df.tail(2)

Unnamed: 0,alternate_name,citizenship,event,full_name,game_type,gender,medals_bronze,medals_gold,medals_silver,medals_total,...,season,years,country_name,article_results,hits,geometry.coordinates,properties.article,geometry.type,properties.name,properties.headline
143,,UKR,gymnastics,Polina ASTAKHOVA,Olympic,Women,3,5,2,10,...,Summer,1956-1964,Soviet Union,[{'headline': 'The Starting Line: Gymnastics t...,4,"[48.379433, 31.16558]",<h1>Polina ASTAKHOVA</h1><p> Represented: Sovi...,Point,Soviet Union,Polina ASTAKHOVA
144,,GER,equestrian,Isabell WERTH,Olympic,Women,0,6,4,10,...,Summer,2000-2016,Germany,[{'headline': 'Past Equestrian Scandals Lead t...,8,"[51.165691, 10.451526]",<h1>Isabell WERTH</h1><p> Represented: Germany...,Point,Germany,Isabell WERTH


### Assign colours to Paralympic and Olympic athletes.
* Paralympic = blue #33B5FF
* Olympic = yellow #FFE333

In [30]:
df['properties.color'] = '#33B5FF'
df.loc[80:, 'properties.color'] = '#FFE333'
df.head(1)

Unnamed: 0,alternate_name,citizenship,event,full_name,game_type,gender,medals_bronze,medals_gold,medals_silver,medals_total,...,years,country_name,article_results,hits,geometry.coordinates,properties.article,geometry.type,properties.name,properties.headline,properties.color
0,,SWE,Para shooting,Jonas JAKOBSSON,Paralympic,Men,8,17,2,27,...,1980-2012,Sweden,0,0,"[60.128161, 18.643501]",<h1>Jonas JAKOBSSON</h1><p> Represented: Swede...,Point,Sweden,Jonas JAKOBSSON,#33B5FF


In [31]:
df.tail(2)

Unnamed: 0,alternate_name,citizenship,event,full_name,game_type,gender,medals_bronze,medals_gold,medals_silver,medals_total,...,years,country_name,article_results,hits,geometry.coordinates,properties.article,geometry.type,properties.name,properties.headline,properties.color
143,,UKR,gymnastics,Polina ASTAKHOVA,Olympic,Women,3,5,2,10,...,1956-1964,Soviet Union,[{'headline': 'The Starting Line: Gymnastics t...,4,"[48.379433, 31.16558]",<h1>Polina ASTAKHOVA</h1><p> Represented: Sovi...,Point,Soviet Union,Polina ASTAKHOVA,#FFE333
144,,GER,equestrian,Isabell WERTH,Olympic,Women,0,6,4,10,...,2000-2016,Germany,[{'headline': 'Past Equestrian Scandals Lead t...,8,"[51.165691, 10.451526]",<h1>Isabell WERTH</h1><p> Represented: Germany...,Point,Germany,Isabell WERTH,#FFE333


## Assign group_id and group_name to 'medal_count' or 'article_count'.

First, duplicate all rows.
Then, assign half of the rows group_name = 'medal_count' and group_id = 1.
Assign the other half group_name = 'article_count' and group_id = 2

* 'Number of medals won' = 1
* 'Number of NYT articles' = 2

In [32]:
''' Repeat without index '''
df_repeated = pd.concat([df]*2, ignore_index=True)
df_repeated.shape

(290, 22)

In [33]:
df_repeated['properties.group_id'] = 2
df_repeated['properties.group_name'] = 'Number of NYT articles'

In [34]:
df_repeated.loc[261]

alternate_name                                                            
citizenship                                                            USA
event                                                             aquatics
full_name                                                   Michael PHELPS
game_type                                                          Olympic
gender                                                                 Men
medals_bronze                                                            2
medals_gold                                                             23
medals_silver                                                            3
medals_total                                                            28
other_info                                                                
season                                                              Summer
years                                                            2004-2016
country_name             

### Double check where the duplicate rows start. Rows 0-144 are one frame; rows 145-the end are duplicates

In [35]:
df_repeated.loc[0] == df_repeated.loc[145]

alternate_name           True
citizenship              True
event                    True
full_name                True
game_type                True
gender                   True
medals_bronze            True
medals_gold              True
medals_silver            True
medals_total             True
other_info               True
season                   True
years                    True
country_name             True
article_results          True
hits                     True
geometry.coordinates     True
properties.article       True
geometry.type            True
properties.name          True
properties.headline      True
properties.color         True
properties.group_id      True
properties.group_name    True
dtype: bool

In [36]:
df_repeated.loc[:144, 'properties.group_id'] = 1
df_repeated.loc[:144, 'properties.group_name'] = 'Number of medals won'

df_repeated

Unnamed: 0,alternate_name,citizenship,event,full_name,game_type,gender,medals_bronze,medals_gold,medals_silver,medals_total,...,article_results,hits,geometry.coordinates,properties.article,geometry.type,properties.name,properties.headline,properties.color,properties.group_id,properties.group_name
0,,SWE,Para shooting,Jonas JAKOBSSON,Paralympic,Men,8,17,2,27,...,0,0,"[60.128161, 18.643501]",<h1>Jonas JAKOBSSON</h1><p> Represented: Swede...,Point,Sweden,Jonas JAKOBSSON,#33B5FF,1,Number of medals won
1,,ITA,Wheelchair fencing,Roberto MARSON,Paralympic,Men,3,16,7,26,...,0,0,"[41.87194, 12.56738]",<h1>Roberto MARSON</h1><p> Represented: Italy<...,Point,Italy,Roberto MARSON,#33B5FF,1,Number of medals won
2,,GBR,Para swimming,Mike KENNY,Paralympic,Men,0,16,0,16,...,"[{'headline': ""CADETS' DEDMOND TIES DASH MARK""...",1,"[55.378051, -3.435973]",<h1>Mike KENNY</h1><p> Represented: United Kin...,Point,United Kingdom,Mike KENNY,#33B5FF,1,Number of medals won
3,,BRA,Para swimming,Daniel DIAS,Paralympic,Men,3,14,7,24,...,0,0,"[-14.235004, -51.92528]",<h1>Daniel DIAS</h1><p> Represented: Brazil</p...,Point,Brazil,Daniel DIAS,#33B5FF,1,Number of medals won
4,,SUI,Para athletics,Heinz FREI,Paralympic,Men,6,14,6,26,...,[{'headline': 'Day 7: Second Gold for Pistoriu...,3,"[46.818188, 8.227511999999999]",<h1>Heinz FREI</h1><p> Represented: Switzerlan...,Point,Switzerland,Heinz FREI,#33B5FF,1,Number of medals won
5,,SUI,Para athletics | handcycling,Franz NIETLISPACH,Paralympic,Men,2,14,6,22,...,0,0,"[46.818188, 8.227511999999999]",<h1>Franz NIETLISPACH</h1><p> Represented: Swi...,Point,Switzerland,Franz NIETLISPACH,#33B5FF,1,Number of medals won
6,,CAN,Para swimming,Michael EDGSON,Paralympic,Men,0,14,2,16,...,0,0,"[56.130366, -106.346771]",<h1>Michael EDGSON</h1><p> Represented: Canada...,Point,Canada,Michael EDGSON,#33B5FF,1,Number of medals won
7,,AUS,Para swimming,Matthew COWDREY,Paralympic,Men,3,13,7,23,...,[{'headline': 'A Fifth Gold for Du Toit and a ...,4,"[-25.274398, 133.775136]",<h1>Matthew COWDREY</h1><p> Represented: Austr...,Point,Australia,Matthew COWDREY,#33B5FF,1,Number of medals won
8,,NOR,Para swimming,Erling TRONDSEN,Paralympic,Men,1,13,6,20,...,0,0,"[60.472024, 8.468946]",<h1>Erling TRONDSEN</h1><p> Represented: Norwa...,Point,Norway,Erling TRONDSEN,#33B5FF,1,Number of medals won
9,,USA,Para athletics,Bart DODSON,Paralympic,Men,4,13,3,20,...,0,0,"[37.09024, -95.712891]",<h1>Bart DODSON</h1><p> Represented: United St...,Point,United States,Bart DODSON,#33B5FF,1,Number of medals won


## Add radius column (tweak the numbers later by linking the size of the radius to the number of articles or medals)

* Initialise df_repeated['properties.radius'] = 0
* For half of the rows, make the radius correspond to medal count, and the other half to article count.

In [37]:
df_repeated['properties.radius'] = 0

In [38]:
# df_repeated.hits.value_counts()

In [39]:
hits_list = df.hits.to_list()
radius_list = []

for hits in hits_list:
    if hits == 0:
        radius = 3
    elif hits < 10:
        radius = 4
    elif hits < 30:
        radius = 5
    elif hits < 70:
        radius = 6.5
    elif hits < 100:
        radius = 7
    elif hits < 300:
        radius = 15
    elif hits < 700:
        radius = 20
    elif hits < 800:
        radius = 35
    else:
        radius = 45
    radius_list.append(radius)
# radius_list

In [40]:
df_repeated.loc[145:,'properties.radius'] = radius_list
df_repeated.tail(145)

Unnamed: 0,alternate_name,citizenship,event,full_name,game_type,gender,medals_bronze,medals_gold,medals_silver,medals_total,...,hits,geometry.coordinates,properties.article,geometry.type,properties.name,properties.headline,properties.color,properties.group_id,properties.group_name,properties.radius
145,,SWE,Para shooting,Jonas JAKOBSSON,Paralympic,Men,8,17,2,27,...,0,"[60.128161, 18.643501]",<h1>Jonas JAKOBSSON</h1><p> Represented: Swede...,Point,Sweden,Jonas JAKOBSSON,#33B5FF,2,Number of NYT articles,3.0
146,,ITA,Wheelchair fencing,Roberto MARSON,Paralympic,Men,3,16,7,26,...,0,"[41.87194, 12.56738]",<h1>Roberto MARSON</h1><p> Represented: Italy<...,Point,Italy,Roberto MARSON,#33B5FF,2,Number of NYT articles,3.0
147,,GBR,Para swimming,Mike KENNY,Paralympic,Men,0,16,0,16,...,1,"[55.378051, -3.435973]",<h1>Mike KENNY</h1><p> Represented: United Kin...,Point,United Kingdom,Mike KENNY,#33B5FF,2,Number of NYT articles,4.0
148,,BRA,Para swimming,Daniel DIAS,Paralympic,Men,3,14,7,24,...,0,"[-14.235004, -51.92528]",<h1>Daniel DIAS</h1><p> Represented: Brazil</p...,Point,Brazil,Daniel DIAS,#33B5FF,2,Number of NYT articles,3.0
149,,SUI,Para athletics,Heinz FREI,Paralympic,Men,6,14,6,26,...,3,"[46.818188, 8.227511999999999]",<h1>Heinz FREI</h1><p> Represented: Switzerlan...,Point,Switzerland,Heinz FREI,#33B5FF,2,Number of NYT articles,4.0
150,,SUI,Para athletics | handcycling,Franz NIETLISPACH,Paralympic,Men,2,14,6,22,...,0,"[46.818188, 8.227511999999999]",<h1>Franz NIETLISPACH</h1><p> Represented: Swi...,Point,Switzerland,Franz NIETLISPACH,#33B5FF,2,Number of NYT articles,3.0
151,,CAN,Para swimming,Michael EDGSON,Paralympic,Men,0,14,2,16,...,0,"[56.130366, -106.346771]",<h1>Michael EDGSON</h1><p> Represented: Canada...,Point,Canada,Michael EDGSON,#33B5FF,2,Number of NYT articles,3.0
152,,AUS,Para swimming,Matthew COWDREY,Paralympic,Men,3,13,7,23,...,4,"[-25.274398, 133.775136]",<h1>Matthew COWDREY</h1><p> Represented: Austr...,Point,Australia,Matthew COWDREY,#33B5FF,2,Number of NYT articles,4.0
153,,NOR,Para swimming,Erling TRONDSEN,Paralympic,Men,1,13,6,20,...,0,"[60.472024, 8.468946]",<h1>Erling TRONDSEN</h1><p> Represented: Norwa...,Point,Norway,Erling TRONDSEN,#33B5FF,2,Number of NYT articles,3.0
154,,USA,Para athletics,Bart DODSON,Paralympic,Men,4,13,3,20,...,0,"[37.09024, -95.712891]",<h1>Bart DODSON</h1><p> Represented: United St...,Point,United States,Bart DODSON,#33B5FF,2,Number of NYT articles,3.0


In [41]:
# df.sort_values('medals_total')

In [42]:
medal_list = df.medals_total.to_list()
# medal_list

In [43]:
radius_list = []

for count in medal_list:
    if count < 29 :
        radius = count/2
    else:
        radius = 20
    radius_list.append(radius)
# radius_list

In [44]:
df_repeated.loc[:144,'properties.radius'] = radius_list
# df_repeated

In [45]:
df_repeated[df_repeated['full_name'] == 'Michael PHELPS']['article_results']

116    [{'headline': 'Michael Phelps, and His Friends...
261    [{'headline': 'Michael Phelps, and His Friends...
Name: article_results, dtype: object

Note: I manually edited the radius for outliers (Zorn, Phelps, etc) in the final geojson file to make final adjustments

## Get rid of extra columns so that I have clean dataframe that can be transformed into geojson

In [46]:
df_repeated.columns

Index(['alternate_name', 'citizenship', 'event', 'full_name', 'game_type',
       'gender', 'medals_bronze', 'medals_gold', 'medals_silver',
       'medals_total', 'other_info', 'season', 'years', 'country_name',
       'article_results', 'hits', 'geometry.coordinates', 'properties.article',
       'geometry.type', 'properties.name', 'properties.headline',
       'properties.color', 'properties.group_id', 'properties.group_name',
       'properties.radius'],
      dtype='object')

In [47]:
df_repeated = df_repeated.drop(columns=['alternate_name', 'citizenship','event', 'full_name', 'game_type',\
       'gender', 'medals_bronze', 'medals_gold', 'medals_silver',\
       'medals_total', 'other_info', 'season', 'years', 'country_name',\
       'article_results', 'hits'])
# df_repeated

In [48]:
df['properties.color'].value_counts(dropna=False)

#33B5FF    80
#FFE333    65
Name: properties.color, dtype: int64

## Save the geojson dataframe as a csv

In [49]:
df_repeated.loc[116]['properties.article']

'<h1>Michael PHELPS</h1><p> Represented: United States</p><p> Competed in the Summer Olympics from 2004-2016</p><p> Event: aquatics</p><p>Total medals won: 28</p><p> Gold: 23, Silver: 3, Bronze: 2</p><p>Number of articles in The New York Times: 1000</p>Top search results:<div id="story"><p><a href="https://www.nytimes.com/2017/09/23/insider/michael-phelps-grant-hackett-friendship.html">Michael Phelps, and His Friends</a></p></div><div id="story"><p><a href="https://www.nytimes.com/2017/09/21/sports/michael-phelps-grant-hackett-tiger-woods.html">Michael Phelps: A Golden Shoulder to Lean On</a></p></div><div id="story"><p><a href="https://www.nytimes.com/aponline/2019/06/30/us/ap-history.html">Today in History</a></p></div><div id="story"><p><a href="https://www.nytimes.com/2019/04/13/sports/tiger-woods-masters-augusta.html">Tiger Woods, in Range of Masters Win, Displays His Dominance for New Generation</a></p></div><div id="story"><p><a href="https://www.nytimes.com/2018/07/27/sports/mi

In [50]:
df_repeated.to_csv('geojson_df.csv', index=False)