In [1]:
import sys
import os
sys.path.append(os.path.abspath('../')) ## needed to import the function.py file

import pandas as pd
from functions import *
import timeit

# Take a timestamp up front so the total runtime can be reported at the very end
start = timeit.default_timer()
print(start)

## Load the cleaned survey data (feather file) into a dataframe
filelocation = '../../data/DataClean'
df = pd.read_feather(path=filelocation)

###### Stopword list creation #########

# Load the custom stopwords from the "stopword" column of the config workbook.
customstopwords = pd.read_excel('../../config/customstopwords.xlsx')
customstopwords = customstopwords['stopword'].tolist()

# Place names (ft_startort) are already available as metadata and should not
# show up in the comment analysis, so they are added to the stopword list.
# Both the full lower-cased name and each whitespace-separated token of it are
# collected. A set is used so duplicates collapse automatically and no
# repeated list scans are needed; only the distinct column values are visited.
unique_orte = set()
for location in df.ft_startort.dropna().unique():
    # Ignore anything that is not text (missing values, stray numbers, ...)
    if not isinstance(location, str):
        continue

    location = location.lower()
    if not location:
        continue
    unique_orte.add(location)

    for token in location.split():
        # Strip trailing commas first, then drop tokens that end up empty
        # (e.g. a lone ",") so no empty string lands in the stopword list
        token = token.rstrip(',')
        if token:
            unique_orte.add(token)

# "zug" must stay out of the stopword list.
# NOTE(review): presumably because "Zug" is also the German word for "train"
# and is therefore meaningful inside comments -- confirm.
# discard() does not fail when "zug" never occurred in the data.
unique_orte.discard("zug")

# Sorted so the resulting list is deterministic between runs
orte = sorted(unique_orte)

# extend the stopword list with the place names
customstopwords.extend(orte)

## Restrict the data to surveys whose "Kommentar" field is filled in
df_text = df.dropna(subset=["Kommentar"])

#df_text = df_text.head(2000)
print(len(df_text))

# A comment only counts as valid when its string form has at least 3 characters
has_min_length = df_text["Kommentar"].map(str).str.len() >= 3
df_text = df_text[has_min_length].reset_index(drop=True)


## Add basic text features
# Whitespace is normalised first via remove_redundant_whitespaces (imported helper)
df_text["Kommentar"] = remove_redundant_whitespaces(df_text["Kommentar"])
df_text = add_basic_textfeatures(df_text, "Kommentar")

## Preprocess text
# NOTE(review): the return value is not used, so preprocess_text presumably
# modifies df_text in place -- confirm against functions.py
#preprocess_text(df_text, 'Kommentar', locations=didok)
preprocess_text(df_text, 'Kommentar', custom_stopwords=customstopwords)


# Derive additional date columns from 'u_date' for easier slicing later on
# NOTE(review): the return value is not used, so add_date_columns presumably
# works on df_text in place -- confirm against functions.py
add_date_columns(df_text, 'u_date')

# Order by date with the newest entries first and renumber the index
# so the frame can be exported right away
df_text = df_text.sort_values(by="u_date", ascending=False).reset_index(drop=True)


############## Export ##############
# Persist the prepared text data as a feather file
df_text.to_feather('../../data/DataText')

# Report the total runtime, measured from the timestamp taken at the start
end = timeit.default_timer()
print("Duration: ", end - start)


11.124351582
64648
Duration:  379.64587565700003
