# Prepare datasets for EDA

This script creates four separate dataframes from the raw data, which are then analyzed in the EDA.

In [3]:
import pandas as pd
import os
import numpy as np
import datetime 
import decimal

## Load Raw Data

In [11]:
# Read the complete raw dataset from its feather file into one dataframe.
df_kuzu_zug_raw = pd.read_feather(path='../data/Data_Raw_Kuzu_Zug')
# Tip: list(df_kuzu_zug_raw.columns) shows every available column name.

## Transform Data

#### Kommentare without Label

In [12]:
# Build the unlabeled free-text dataframe: one row per usable comment, with
# the participant id, the date and the language code of the answer.
df_text = df_kuzu_zug_raw[["participant", "u_date", "Kommentar", "S_sprache"]]

# A row is kept only if
#   * the comment is not the "-66" placeholder and the language is not "0",
#   * both the comment and the date are present.
has_valid_fields = (
    (df_text["Kommentar"] != "-66")
    & (df_text["S_sprache"] != "0")
    & df_text["Kommentar"].notna()
    & df_text["u_date"].notna()
)
df_text = df_text[has_valid_fields]

# min 4 characters for valid comment
df_text = df_text[df_text["Kommentar"].astype(str).str.len() >= 4]

# `assign` returns a new dataframe, so the date conversion never writes into
# a filtered slice of the raw data (which raised SettingWithCopyWarning).
# The dates are parsed only after filtering, exactly as before.
df_text = (
    df_text
    .assign(u_date=lambda frame: pd.to_datetime(frame["u_date"]))
    .sort_values(by="u_date")
    # Column naming
    .rename(columns={
        "participant": "ParticipantId",
        "Kommentar": "Value",
        "S_sprache": "Language",
        "u_date": "Date",
    })
    # Fresh 0..n-1 index after filtering and sorting
    .reset_index(drop=True)
)


#### Kommentare with Label

In [13]:
# Load the question-code sheet of the config workbook.
# The `with` block closes the file handle again; the previous bare open()
# call left it open for the lifetime of the kernel.
with open('../config/config.xlsx', 'rb') as config_file:
    labeled_config = pd.read_excel(config_file, sheet_name='Fragecodes')

# Select the codes of all labeled free-text questions of the kuzu_zug survey.
is_labeled_text = (
    (labeled_config['UmfrageName'] == 'kuzu_zug')
    & (labeled_config['FrageType'] == 'text_labeled')
)
labeled_codes = list(labeled_config.loc[is_labeled_text, 'FrageCode'])

# 'participant' goes first so every answer can be mapped back to its row.
labeled_kuzu_zug = ['participant', *labeled_codes]

# Restrict the raw data to the participant id and the labeled text questions.
df_text_labeled = df_kuzu_zug_raw[labeled_kuzu_zug]

# Wide -> long: one row per (participant, question code).
# Only the question codes are value columns. 'participant' is the id column,
# so it is no longer listed among the value columns as well.
df_text_labeled = pd.melt(
    df_text_labeled,
    id_vars='participant',
    value_vars=labeled_codes,
    var_name='FrageCode',
    value_name='Value',
)

# Join date and language from the raw data via the participant id.
df_text_labeled = df_text_labeled.merge(
    df_kuzu_zug_raw[['u_date', 'participant', 'S_sprache']],
    on='participant',
    how='left',
)

# Cleaning: drop the "-66" placeholder, missing answers, answers with
# language "0" and rows without a date.
has_valid_fields = (
    (df_text_labeled['Value'] != '-66')
    & df_text_labeled['Value'].notna()
    & (df_text_labeled['S_sprache'] != '0')
    & df_text_labeled['u_date'].notna()
)
df_text_labeled = df_text_labeled[has_valid_fields]

# min 4 characters for valid comment
df_text_labeled = df_text_labeled[
    df_text_labeled['Value'].astype(str).str.len() >= 4
]

# `assign` returns a new dataframe, so the date conversion does not write
# into a filtered slice (which raised SettingWithCopyWarning).
df_text_labeled = (
    df_text_labeled
    .assign(u_date=lambda frame: pd.to_datetime(frame['u_date']))
    # Column order
    .reindex(columns=['participant', 'u_date', 'FrageCode', 'Value', 'S_sprache'])
    # Column naming
    .rename(columns={
        'participant': 'ParticipantId',
        'u_date': 'Date',
        'S_sprache': 'Language',
    })
    # Fresh 0..n-1 index after filtering
    .reset_index(drop=True)
)


#### Satisfaction

In [14]:
# Load the question-code sheet of the config workbook.
# The `with` block closes the file handle again; the previous bare open()
# call left it open for the lifetime of the kernel.
with open('../config/config.xlsx', 'rb') as config_file:
    satisfaction_config = pd.read_excel(config_file, sheet_name='Fragecodes')

# Select the codes of all satisfaction questions of the kuzu_zug survey.
is_satisfaction = (
    (satisfaction_config['UmfrageName'] == 'kuzu_zug')
    & (satisfaction_config['FrageType'] == 'satisfaction')
)
satisfaction_codes = list(satisfaction_config.loc[is_satisfaction, 'FrageCode'])

# 'participant' goes first so every answer can be mapped back to its row.
satisfaction_kuzu_zug = ['participant', *satisfaction_codes]

# Restrict the raw data to the participant id and the satisfaction questions.
df_satisfaction = df_kuzu_zug_raw[satisfaction_kuzu_zug]

# Wide -> long: one row per (participant, question code).
# Only the question codes are value columns. 'participant' is the id column,
# so it is no longer listed among the value columns as well.
df_satisfaction = pd.melt(
    df_satisfaction,
    id_vars='participant',
    value_vars=satisfaction_codes,
    var_name='FrageCode',
    value_name='Value',
)

# Cleaning: keep only answers that are one of the strings "1" .. "10".
valid_scores = [str(score) for score in range(1, 11)]
df_satisfaction = df_satisfaction[df_satisfaction['Value'].isin(valid_scores)]

# Join the date from the raw data via the participant id.
df_satisfaction = df_satisfaction.merge(
    df_kuzu_zug_raw[['u_date', 'participant']],
    on='participant',
    how='left',
)

# Convert to the correct datatypes. The explicit copy makes the filtered
# result an independent dataframe, so the column assignments below do not
# write into a slice (which raised SettingWithCopyWarning).
df_satisfaction = df_satisfaction[df_satisfaction['u_date'].notna()].copy()
df_satisfaction['u_date'] = pd.to_datetime(df_satisfaction['u_date'])
df_satisfaction = df_satisfaction.sort_values(by='u_date')
df_satisfaction['Value'] = df_satisfaction['Value'].astype(str).astype(int)

# Transform from 5 and 10 scaling to one harmonized 100 scale
# (10 until 03.2020 and 5 since 5.2020 - April 2020 no data).
# NOTE(review): rows dated on or before 2000-01-01, or later than
# 2020-04-30 00:00 but before 2020-05-01, match neither mask and are dropped
# by the concat below. Confirm this is intended.
mask_10 = (df_satisfaction['u_date'] > '2000-1-1') & (df_satisfaction['u_date'] <= '2020-4-30')
mask_5 = df_satisfaction['u_date'] >= '2020-5-1'

# Map 1..10 -> 0..100 and 1..5 -> 0..100 with column arithmetic.
df_satisfaction_10 = df_satisfaction.loc[mask_10]
df_satisfaction_10 = df_satisfaction_10.assign(
    Value=(df_satisfaction_10['Value'] - 1) / 9 * 100
)
df_satisfaction_5 = df_satisfaction.loc[mask_5]
df_satisfaction_5 = df_satisfaction_5.assign(
    Value=(df_satisfaction_5['Value'] - 1) / 4 * 100
)
df_satisfaction = pd.concat([df_satisfaction_5, df_satisfaction_10])

df_satisfaction = (
    df_satisfaction
    # Column order
    .reindex(columns=['participant', 'u_date', 'FrageCode', 'Value'])
    # Column naming
    .rename(columns={'participant': 'ParticipantId', 'u_date': 'Date'})
    # Change dtypes and round float
    .assign(
        FrageCode=lambda frame: frame['FrageCode'].astype('category'),
        Value=lambda frame: frame['Value'].round(3),
    )
    # Newest answers first, then a fresh 0..n-1 index
    .sort_values('Date', ascending=False)
    .reset_index(drop=True)
)



#### Additional Meta data

In [15]:
# Build the metadata dataframe: one row per raw record with the selected
# columns below, converted to datetime / numeric / categorical dtypes.
df_meta = df_kuzu_zug_raw[[
    "participant",
    "u_date",
    "S_sex",
    "S_alter",
    "u_bezugsart",
    "u_fahrausweis",
    "u_ga",
    "u_klassencode",
    "u_preis",
    "u_ticket",
    "device_type",
    "dispcode",
    "fg_startort",
    "fg_abfahrt",
    "ft_startort",
    "ft_abfahrt",
    "ft_zielort",
    "ft_ankunft",
    "fg_zielort",
    "fg_ankunft",
    "ft_haltestellen",
    "ft_vm_kurz",
    "R_anschluss",
    "R_stoerung",
    "R_zweck",
    "S_berufstaetigkeit",
    "S_wohnsitz",
    "S_Usertyp1",
    "S_Usertyp2",
    "S_Usertyp3",
]]

# Convert empty / invalid answers to NaN, one code after the other.
# NOTE(review): '0' is blanked in every column, including "u_preis" and
# "S_alter". Confirm that a literal '0' is never a valid answer there.
for invalid_answer in ('-66', '-99', '-77', '0', 'Weiss nicht'):
    df_meta = df_meta.replace(invalid_answer, np.nan)

# Rows without a date are dropped. The explicit copy makes the result an
# independent dataframe, so the column assignments below do not write into
# a slice (which raised SettingWithCopyWarning).
df_meta = df_meta[df_meta["u_date"].notna()].copy()

# Convert to the correct datatypes.
for date_column in ("u_date", "fg_abfahrt", "ft_abfahrt", "ft_ankunft", "fg_ankunft"):
    df_meta[date_column] = pd.to_datetime(df_meta[date_column])

for numeric_column in ("u_preis", "S_alter"):
    df_meta[numeric_column] = pd.to_numeric(df_meta[numeric_column], downcast='float')

tocategory = [
    'S_sex',
    'u_bezugsart',
    'u_fahrausweis',
    'u_ga',
    'u_klassencode',
    'u_ticket',
    "device_type",
    "dispcode",
    "fg_startort",
    "ft_startort",
    "ft_zielort",
    "fg_zielort",
    "ft_haltestellen",
    "ft_vm_kurz",
    "R_anschluss",
    "R_stoerung",
    "R_zweck",
    "S_berufstaetigkeit",
    "S_wohnsitz",
    "S_Usertyp1",
    "S_Usertyp2",
    "S_Usertyp3",
]
df_meta = df_meta.astype({column: 'category' for column in tocategory})

df_meta = (
    df_meta
    # Column naming
    .rename(columns={'participant': 'ParticipantId', 'u_date': 'Date'})
    # Fresh 0..n-1 index after filtering
    .reset_index(drop=True)
)

### Write dataframes to files

In [16]:
# Persist each prepared dataframe as a feather file, in this order.
feather_outputs = (
    (df_satisfaction, "../data/Data_Satisfaction"),
    (df_meta, "../data/Data_Metadata"),
    (df_text, "../data/Data_Unlabeled"),
    (df_text_labeled, "../data/Data_Labeled"),
)
for prepared_frame, target_path in feather_outputs:
    prepared_frame.to_feather(target_path)