In [61]:
import pandas as pd
import os
import logging

logging.basicConfig(filename='log.log', level=logging.INFO)

In [69]:
base_url = r"./data_original/"
dest_url = r"./data_cleaned/"
files = []
for file in os.listdir(base_url):
    if file.endswith(".csv"):
        files.append(file)

print(files)

['1001-3000 study reports.csv', '101-1000 study reports.csv', '3 berichte in den 5000er bereichen study reports.csv', '3001-4170 study reports.csv', '4172-4405 study reports.csv', '4407-4637 study reports.csv', '4639-4937 study reports.csv', '5380 - ende 7424 normale version reichen study reports.csv', 'first 100 study reports.csv', 'short version study reports.csv', 'short version study reports_copy.csv']


#### Preprocessing and structuring of each sub-dataset

In [72]:
#Some entries have another format regarding their metainformation.
#There is another column that wasnt handeled by the crawler,
#therefore some entries have a right shift of the values "Studiengang" and "Fakultaet".
#To correct this, there is the function correct_right_shift().
#However, the "Verfasser-ID" wasnt scraped in these cases but its a neglectable value and is therefore not corrected.

def correct_right_shift(df):
    for index, row in df.iterrows():
        try:
            if(row["Verfasser-ID"].isnumeric()) is not True:
                copy_studiengang = row["Verfasser-ID"]
                copy_fakultaet = row["Studiengang"]
                df.loc[index, "Studiengang"] = copy_studiengang
                df.loc[index, "Fakultaet"] = copy_fakultaet
                df.loc[index, "Verfasser-ID"] = "NO AUTHOR ID FROM SCRAPER"
        except AttributeError:
            print("Attribute Error in: " + str(index) + " Column is already a number")

In [63]:
# Some entries have another format regarding their main information.
# The columns "Budget" and "Finanzierung" arent there and therefore need to be added as a empty column to ensure alignment

def add_budget_and_finance_column(df):
    if ("Budget" not in df.columns):
        df["Budget"] = "NO BUDGET COLUMN IN REPORT"
    if ("Finanzierung" not in df.columns):
        df["Finanzierung"] = "NO FINANCE COLUMN IN REPORT"

In [82]:
# The crawler added an index column, however, since all files need to be concatenated, we need an overall index.
# Therefore the index from each file needs to be removed

def drop_index_column(df):
    df.drop(["Unnamed: 0"], axis=1, inplace=True)

In [81]:
for file in files:
    print(file)
    url = base_url + file
    df = pd.read_csv(url, sep=";")
    correct_right_shift(df)
    add_budget_and_finance_column(df)
    drop_index_column(df)
    df.to_csv(dest_url + file, sep=";", index=False)

1001-3000 study reports.csv
101-1000 study reports.csv
3 berichte in den 5000er bereichen study reports.csv
3001-4170 study reports.csv
Attribute Error in: 0 Column is already a number
Attribute Error in: 1 Column is already a number
Attribute Error in: 2 Column is already a number
Attribute Error in: 3 Column is already a number
Attribute Error in: 4 Column is already a number
Attribute Error in: 5 Column is already a number
Attribute Error in: 6 Column is already a number
Attribute Error in: 7 Column is already a number
Attribute Error in: 8 Column is already a number
Attribute Error in: 9 Column is already a number
Attribute Error in: 10 Column is already a number
Attribute Error in: 11 Column is already a number
Attribute Error in: 12 Column is already a number
Attribute Error in: 13 Column is already a number
Attribute Error in: 14 Column is already a number
Attribute Error in: 15 Column is already a number
Attribute Error in: 16 Column is already a number
Attribute Error in: 17 C

FileNotFoundError: [Errno 2] No such file or directory: './data_original/short version study reports_copy.csv'

#### Merging all sub-datasets together

In [87]:
files = []
for file in os.listdir(dest_url):
    if file.endswith(".csv"):
        files.append(file)

df = pd.DataFrame()
for file in files:
    df_help = pd.read_csv(dest_url + file, sep=";")
    df = pd.concat([df, df_help])

df.to_csv(dest_url + "final.csv", index=False, sep=";")