In [1]:
import pandas as pd
import numpy as np

Since we now have our three datasets, we can merge them into a single one. 

## Input Files


- **data_withLocation.csv**
> **Columns**: id - Canton - District <br/>
9514097914,246,BE <br/>
9514846412,112,ZH <br/>
9516574359,2500,GE <br/>
9516952605,1726,SG <br/>
9517198943,2225,VD

- **sentiment.txt**
> **Columns**: id - Sentiment score <br/>
9514097914	0 <br/>
9514846412	3 <br/>
9516574359	2 <br/>
9516952605	1

- **data_dates.csv**
> **Columns**: id - Year - Month - Day <br/>
316216215464992770,2013,3,25 <br/>
609352873164587009,2015,6,12 <br/>
438759513676853248,2014,2,26 <br/>
402160362499612672,2013,11,17 <br/>
206689780308787201,2012,5,27 <br/>



### Data with locations

In [2]:
# Load the location table. Column names are supplied explicitly, so the
# first line of the file is read as data rather than as a header.
data_with_locations = pd.read_csv(
    'data_created/data_withLocation.csv',
    names=['id', 'Canton', 'District'],
)
# Use the id as row index (the id column itself is kept) so the frames can
# later be aligned on it.
data_with_locations.index = data_with_locations['id']
data_with_locations.head(n=5)

Unnamed: 0_level_0,id,Canton,District
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
9514097914,9514097914,246,BE
9514846412,9514846412,112,ZH
9516574359,9516574359,2500,GE
9516952605,9516952605,1726,SG
9517198943,9517198943,2225,VD


### Data_sentiment

In [4]:
# Load the tab-separated (id, Score) table; names are supplied explicitly.
data_sentiment = pd.read_csv(
    'data_created/sentiment.txt',
    sep='\t',
    names=["id", "Score"],
)
# Discard rows that have no id, then index the remaining rows by id
# (the id column itself is kept).
data_sentiment = data_sentiment[data_sentiment['id'].notnull()]
data_sentiment.index = data_sentiment['id']
data_sentiment.head(n=5)

Unnamed: 0_level_0,id,Score
id,Unnamed: 1_level_1,Unnamed: 2_level_1
9514097914,9514097914,0.0
9514846412,9514846412,3.0
9516574359,9516574359,2.0
9516952605,9516952605,1.0
9517198943,9517198943,-1.0


> We forgot to "clean" it before. Some indexes are not valid, and this will be problematic when trying to join the datasets. Therefore, we remove all entries in the *data_sentiment* dataframe which have an invalid index.

In [5]:
# Index cleaning
# Keep only the rows whose index (the id) is made of digits only, then store
# the index as int64 so it can be aligned with the other frames.
#
# The validity check is vectorised on the index. A row-wise apply(axis=1)
# runs a Python lambda per row and builds a mixed-dtype row Series, which can
# upcast an integer id to float ("123.0") and wrongly reject it.
is_idx_valid = data_sentiment.index.astype(str).str.isdigit()

# assign() returns a new frame, so the helper columns are not written onto the
# filtered slice created when null ids were removed (no SettingWithCopyWarning).
# The 'idx' and 'isIdxValid' columns are kept for backward compatibility.
data_sentiment = data_sentiment.assign(
    idx=data_sentiment.index,
    isIdxValid=is_idx_valid,
)
data_sentiment = data_sentiment[is_idx_valid]
data_sentiment.index = data_sentiment.index.astype('int64')

### Data dates

In [9]:
# Load the date table. Column names are supplied explicitly, so the first
# line of the file is read as data rather than as a header.
data_dates = pd.read_csv(
    'data_created/data_dates.csv',
    names=['id', 'Year', 'Month', 'Day'],
)
# Index the rows by id (the id column itself is kept).
data_dates.index = data_dates['id']
data_dates.head(n=5)

Unnamed: 0_level_0,id,Year,Month,Day
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
9514097914,9514097914,2010,2,23
9514846412,9514846412,2010,2,23
9516574359,9516574359,2010,2,23
9516952605,9516952605,2010,2,23
9517198943,9517198943,2010,2,23


# Merge *Location* with *Score*

In [6]:
# Reorder the columns of each frame alphabetically by label
# (axis='columns' sorts the column labels, not the rows).
sorted2 = data_with_locations.sort_index(axis='columns')
sorted1 = data_sentiment.sort_index(axis='columns')

In [7]:
# Put the sentiment and location columns side by side, aligned on the index.
data_loca_score_merge = pd.concat(objs=[sorted1, sorted2], axis='columns')

In [9]:
# Keep the three columns of interest and drop every row that is missing
# either its Score or its Canton.
data_location_score = (
    data_loca_score_merge[['Score', 'Canton', 'District']]
    .dropna(subset=['Score', 'Canton'])
)

# Sanity check: per-column "any null left?" flags, then the row count.
print(data_location_score.isna().any())
print(data_location_score.shape[0])

Score       False
Canton      False
District    False
dtype: bool
19250204


# Sentiment score sign

The sentiment score we computed earlier can be any integer. 
We will reduce it to three possibilities: 

- -1 if the score is negative
- 0 if the score is neutral
- 1 if the score is positive

In [10]:
# Reduce the score to its sign: -1 (negative), 0 (neutral) or 1 (positive).
# assign() returns a new frame, so the column is not written onto the filtered
# slice produced by the null-removal step (avoids SettingWithCopyWarning).
data_location_score = data_location_score.assign(
    ScoreBool=np.sign(data_location_score['Score'])
)
# A fixed random_state makes the displayed sample identical on every run.
data_location_score.sample(10, random_state=0)

Unnamed: 0_level_0,Score,Canton,District,ScoreBool
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
666167135626993664,0.0,1503.0,AR,0.0
675242718109503488,-2.0,2225.0,VD,-1.0
490107870009389056,1.0,2603.0,JU,1.0
239269953910489088,0.0,2012.0,TG,0.0
335066244363472897,4.0,2106.0,TI,1.0
240410227235774464,-1.0,1200.0,BS,-1.0
511756976720986113,0.0,2221.0,VD,0.0
680323523588558849,1.0,2500.0,GE,1.0
612348415016587264,2.0,2225.0,VD,1.0
262160532134563840,-3.0,1003.0,FR,-1.0


# Merge with *Dates*

In [10]:
# Append the date columns to the location/score columns, aligned on the index.
# concat's default join is 'outer', so an id present in only one of the two
# frames is kept, with NaN in the columns coming from the other frame.
df_all = pd.concat(objs=[data_location_score, data_dates], axis='columns')

In [11]:
# Preview the first five rows of the merged frame.
df_all.head(n=5)

Unnamed: 0_level_0,id,Score,District,Canton,ScoreBool,id,Year,Month,Day
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
9514097914,9514098000.0,0.0,246.0,BE,0.0,9514098000.0,2010.0,2.0,23.0
9514846412,9514846000.0,3.0,112.0,ZH,1.0,9514846000.0,2010.0,2.0,23.0
9516574359,9516574000.0,2.0,2500.0,GE,1.0,9516574000.0,2010.0,2.0,23.0
9516952605,9516953000.0,1.0,1726.0,SG,1.0,9516953000.0,2010.0,2.0,23.0
9517198943,9517199000.0,-1.0,2225.0,VD,-1.0,9517199000.0,2010.0,2.0,23.0


# Data for Viz

Now that we have the complete dataset, we export it to a CSV file and can build our visualization map!

In [13]:
# Destination of the file consumed by the visualization.
name = 'data_created/data_ready_for_viz.csv'
# Select the exported columns in their final order. The file is written
# without a header row, and the index is written as the first field.
data_to_export = df_all.loc[
    :, ['District', 'Canton', 'Year', 'Month', 'Day', 'Score', 'ScoreBool']
]
data_to_export.to_csv(name, header=False)