In [1]:
import pandas as pd
import csv

In [2]:
data_path = "Data/sample.tsv"
schema_path = "Data/schema.txt"

In [3]:
sample =  pd.read_csv(data_path, sep="\t",encoding='utf-8',  quoting=csv.QUOTE_NONE,escapechar='\\')

In [4]:
schema = pd.read_csv(schema_path, sep="\s+", header =None)

In [5]:
schema.drop(labels = 0, axis =1, inplace= True)

In [6]:
schema[1]

0                 id
1             userId
2          createdAt
3               text
4          longitude
5           latitude
6            placeId
7          inReplyTo
8             source
9          truncated
10     placeLatitude
11    placeLongitude
12        sourceName
13         sourceUrl
14          userName
15        screenName
16    followersCount
17      friendsCount
18     statusesCount
19      userLocation
Name: 1, dtype: object

In [7]:
sample.columns = schema[1]

In [8]:
sample.head()

1,id,userId,createdAt,text,longitude,latitude,placeId,inReplyTo,source,truncated,placeLatitude,placeLongitude,sourceName,sourceUrl,userName,screenName,followersCount,friendsCount,statusesCount,userLocation
0,776523000636203010,2741685639,2016-09-15 20:48:05,https://t.co/noYrTnqmg9,N,N,4e7c21fd2af027c6,N,1,,46.8131,8.22414,Twitter for iPhone,http://twitter.com/#!/download/iphone,samara,letisieg,755,2037,3771,Suisse
1,776523045200691200,435239151,2016-09-15 20:48:15,@BesacTof @Leonid_CCCP Tu dois t'engager en si...,N,N,12eb9b254faf37a3,776522113859608576,5,,47.201,5.94082,Twitter for Android,http://twitter.com/download/android,lebrübrü❤,lebrubru,811,595,30191,Fontain
2,776523058404290560,503244217,2016-09-15 20:48:18,@Mno0or_Abyat اشوف مظاهرات على قانون العمل الج...,N,N,30bcd7f767b4041e,776521597515624448,1,,45.8011,6.16552,Twitter for iPhone,http://twitter.com/#!/download/iphone,عبدالله القنيص,bingnais,28433,417,12262,Shargeyah
3,776523058504925185,452805259,2016-09-15 20:48:18,Greek night #geneve (@ Emilios in Genève) http...,6.14414,46.1966,c3a6437e1b1a726d,N,3,,46.2048,6.14319,foursquare,http://foursquare.com,Alkan Şenli,Alkanoli,204,172,3390,İstanbul/Burgazada
4,776523071025012736,16416746,2016-09-15 20:48:21,@gregorypons #BusinessMontresVision https://t....,N,N,c3a6437e1b1a726d,776520907107995652,18777,,46.2048,6.14319,Twitter Web Client,http://twitter.com,Gregory PONS,gregorypons,2398,305,14917,Geneva + watchmaking planet


In [9]:
sample.shape

(8789, 20)

In [10]:
sample.dropna(subset=['createdAt','placeLatitude','placeLongitude','userId','id','screenName'],inplace=True)

In [11]:
sample['placeLatitude']=pd.to_numeric(sample['placeLatitude'],errors='coerce')
sample['placeLongitude']=pd.to_numeric(sample['placeLongitude'],errors='coerce')

In [12]:
sample.shape

(8789, 20)

##  PREPROCESSING 

#### change the date format and add in the same time the number of tweets per day

In [13]:
from datetime import datetime

In [14]:
date_format = "%Y-%m-%d %H:%M:%S"

In [15]:
sample['createdAt'] = pd.to_datetime(sample['createdAt'],format = date_format,errors='coerce')

####  add the number of tweets per day

In [16]:
freq_per_day = sample.createdAt.dt.date.value_counts()

In [17]:
def get_freq_day(df):
    date_row = df['createdAt'].date()
    return freq_per_day[date_row]
    

In [18]:
sample['freq_day'] = sample.apply(get_freq_day,axis=1)

In [19]:
new_date_format = '%Y-%m-%dT%H:%M:%SZ'

In [20]:
def change_date_format(ds):
    return ds.strftime('%Y-%m-%dT%H:%M:%SZ')
    

In [21]:
sample['createdAt'] = sample['createdAt'].apply(change_date_format)

In [22]:
sample = sample.reindex()

In [23]:
sample.head()

1,id,userId,createdAt,text,longitude,latitude,placeId,inReplyTo,source,truncated,...,placeLongitude,sourceName,sourceUrl,userName,screenName,followersCount,friendsCount,statusesCount,userLocation,freq_day
0,776523000636203010,2741685639,2016-09-15T20:48:05Z,https://t.co/noYrTnqmg9,N,N,4e7c21fd2af027c6,N,1,,...,8.22414,Twitter for iPhone,http://twitter.com/#!/download/iphone,samara,letisieg,755,2037,3771,Suisse,1585
1,776523045200691200,435239151,2016-09-15T20:48:15Z,@BesacTof @Leonid_CCCP Tu dois t'engager en si...,N,N,12eb9b254faf37a3,776522113859608576,5,,...,5.94082,Twitter for Android,http://twitter.com/download/android,lebrübrü❤,lebrubru,811,595,30191,Fontain,1585
2,776523058404290560,503244217,2016-09-15T20:48:18Z,@Mno0or_Abyat اشوف مظاهرات على قانون العمل الج...,N,N,30bcd7f767b4041e,776521597515624448,1,,...,6.16552,Twitter for iPhone,http://twitter.com/#!/download/iphone,عبدالله القنيص,bingnais,28433,417,12262,Shargeyah,1585
3,776523058504925185,452805259,2016-09-15T20:48:18Z,Greek night #geneve (@ Emilios in Genève) http...,6.14414,46.1966,c3a6437e1b1a726d,N,3,,...,6.14319,foursquare,http://foursquare.com,Alkan Şenli,Alkanoli,204,172,3390,İstanbul/Burgazada,1585
4,776523071025012736,16416746,2016-09-15T20:48:21Z,@gregorypons #BusinessMontresVision https://t....,N,N,c3a6437e1b1a726d,776520907107995652,18777,,...,6.14319,Twitter Web Client,http://twitter.com,Gregory PONS,gregorypons,2398,305,14917,Geneva + watchmaking planet,1585


In [37]:
sample.userLocation.head()

0                         Suisse
1                        Fontain
2                      Shargeyah
3             İstanbul/Burgazada
4    Geneva + watchmaking planet
Name: userLocation, dtype: object

###  get the city/state

In [38]:
from geopy.geocoders import Nominatim
geolocator = Nominatim()

In [54]:
coord = str(sample.placeLatitude[3]) +"," + str(sample.placeLongitude[3])

In [55]:
location = geolocator.reverse(coord)

In [56]:
location.raw

{u'address': {u'bus_stop': u'Bel-Air',
  u'city': u'Gen\xe8ve',
  u'country': u'Schweiz, Suisse, Svizzera, Svizra',
  u'country_code': u'ch',
  u'pedestrian': u'Place Philibert-Berthelier',
  u'postcode': u'1204',
  u'state': u'Gen\xe8ve',
  u'suburb': u'Cit\xe9'},
 u'boundingbox': [u'46.2045887', u'46.2047887', u'6.1430872', u'6.1432872'],
 u'display_name': u'Bel-Air, Place Philibert-Berthelier, Cit\xe9, Gen\xe8ve, 1204, Schweiz, Suisse, Svizzera, Svizra',
 u'lat': u'46.2046887',
 u'licence': u'Data \xa9 OpenStreetMap contributors, ODbL 1.0. http://www.openstreetmap.org/copyright',
 u'lon': u'6.1431872',
 u'osm_id': u'768498774',
 u'osm_type': u'node',
 u'place_id': u'7097094'}

#### we add the state in a new column because sometimes there're not the city in raw

In [57]:
location.raw['address']['state']

u'Gen\xe8ve'

In [65]:
def get_state(df):
    state_list = []
    list_lat = sample.placeLatitude
    list_longi = sample.placeLongitude
    for lat in list_lat:
        for longi in list_longi:
            coord =str(lat)+","+str(longi)
            location = geolocator.reverse(coord)
            state_list.append(location.raw['address']['state'])

In [66]:
sample.apply(get_state,axis = 1)

GeocoderServiceError: ('HTTP Error 429: Too Many Requests', u'occurred at index 0')

In [None]:
sample.columns

on the json we'll keep id, userId, createdAt, placeLongitude, placeLatitude, screenName, and add 2 features: total number of tweets per day, number of tweets per country?

## finally we'll get a json file like that 

In [30]:
{"type":"Feature","id":"776523045200691200","geometry":{"type":"Point","coordinates":[47.201,5.94082]},"geometry_name":"tweet_information","properties":{"userid":"3380927","createdAt":"2010-09-30T19:59:10Z","placeLongitude":5.94082,"placeLatitude":47.201,"screenName":"lebrubru"}}

{'geometry': {'coordinates': [47.201, 5.94082], 'type': 'Point'},
 'geometry_name': 'tweet_information',
 'id': '776523045200691200',
 'properties': {'createdAt': '2010-09-30T19:59:10Z',
  'placeLatitude': 47.201,
  'placeLongitude': 5.94082,
  'screenName': 'lebrubru',
  'userid': '3380927'},
 'type': 'Feature'}

In [31]:
import json

In [32]:
# gjson is th emain dictionary
gjson_dict={}
gjson_dict["type"]= "FeatureCollection"
feat_list = []


In [33]:
gjson_dict

{'type': 'FeatureCollection'}

In [34]:
from geojson import Point

In [35]:

# Loop through all the courses, building a list entry which is itself a dictionary
# Each of these dictionaries has nested within it a type dictionary, which contains a point dictionary and a properties dictionary 
for index in sample.index:
    type_dict = {}
    pt_dict = {}
    prop_dict = {}
    type_dict["type"]= "Feature"
    pt_dict["type"]="Point"

    # GEOJSON looks for long,lat so reverse order
    type_dict["geometry"]=Point((sample.loc[index].placeLongitude,sample.loc[index].placeLatitude))
    
    prop_dict["freq_day"] = sample.loc[index].freq_day
    prop_dict["userId"] = sample.loc[index].userId
    prop_dict["createdAt"]= sample.loc[index].createdAt
    prop_dict["placeLongitude"]= sample.loc[index].placeLongitude
    prop_dict["placeLatitude"]= sample.loc[index].placeLatitude
    prop_dict["screenName"] = sample.loc[index].screenName
    type_dict["properties"]=prop_dict
    feat_list.append(type_dict)
    
gjson_dict["features"] = feat_list



In [67]:
gjson_dict

{'features': [{'geometry': {"coordinates": [8.2241400000000002, 46.813099999999999], "type": "Point"},
   'properties': {'createdAt': '2016-09-15T20:48:05Z',
    'freq_day': 1585,
    'placeLatitude': 46.813099999999999,
    'placeLongitude': 8.2241400000000002,
    'screenName': u'letisieg',
    'userId': 2741685639},
   'type': 'Feature'},
  {'geometry': {"coordinates": [5.9408199999999995, 47.201000000000001], "type": "Point"},
   'properties': {'createdAt': '2016-09-15T20:48:15Z',
    'freq_day': 1585,
    'placeLatitude': 47.201000000000001,
    'placeLongitude': 5.9408199999999995,
    'screenName': u'lebrubru',
    'userId': 435239151},
   'type': 'Feature'},
  {'geometry': {"coordinates": [6.1655199999999999, 45.801099999999998], "type": "Point"},
   'properties': {'createdAt': '2016-09-15T20:48:18Z',
    'freq_day': 1585,
    'placeLatitude': 45.801099999999998,
    'placeLongitude': 6.1655199999999999,
    'screenName': u'bingnais',
    'userId': 503244217},
   'type': 'Featu

In [68]:

# Serialize JSON for writing to file
json_path = './Data/json_sample.js'
with open(json_path, 'w') as outfile:
     json.dump(gjson_dict, outfile, sort_keys = True, indent = 4, ensure_ascii=False)
