Feature Engineering

In [2]:
import json
from collections import OrderedDict

def load_data(file_name):
    """Read a UTF-8 encoded JSON file and return its parsed content.

    Args:
        file_name: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file content.
    """
    with open(file_name, mode='r', encoding='utf-8') as source:
        return json.load(source)

# Transliteration table for German umlauts and sharp s. Built once at import
# time; includes the capital "Ö", which was previously left untouched.
_UMLAUT_TRANSLATION = str.maketrans({
    "ß": "ss",
    "ä": "ae",
    "Ä": "Ae",
    "ö": "oe",
    "Ö": "Oe",
    "ü": "ue",
    "Ü": "Ue",
})


def handle_special_characters(value):
    """Transliterate German umlauts and sharp s to ASCII digraphs.

    Args:
        value: Any value. Only ``str`` instances are transformed.

    Returns:
        The transliterated string if ``value`` is a ``str``; otherwise
        ``value`` itself, unchanged.
    """
    if isinstance(value, str):
        return value.translate(_UMLAUT_TRANSLATION)
    return value

def handle_special_chars_in_data(loaded_data):
    """Apply ``handle_special_characters`` to every value of every record.

    Args:
        loaded_data: Iterable of dictionaries.

    Returns:
        A new list of new dictionaries with the same keys; each value is the
        result of ``handle_special_characters`` for the original value. The
        input dictionaries are not modified.
    """
    return [
        {key: handle_special_characters(value) for key, value in record.items()}
        for record in loaded_data
    ]

def save_to_json(data, output_filename):
    """Write ``data`` as JSON to ``output_filename``.

    The file is UTF-8 encoded, non-ASCII characters are written as-is
    (not escaped), and the output is indented by two spaces.

    Args:
        data: JSON-serialisable object to write.
        output_filename: Path of the file to create or overwrite.
    """
    with open(output_filename, mode='w', encoding='utf-8') as target:
        json.dump(data, target, ensure_ascii=False, indent=2)

def handle_datasplit(loaded_data):
    """Run ``change_values_to_zero_one`` over every key/value pair of every record.

    Args:
        loaded_data: Iterable of dictionaries.

    Returns:
        A new list of new dictionaries with the same keys and the converted
        values. The input dictionaries are not modified.
    """
    return [
        {key: change_values_to_zero_one(key, value) for key, value in record.items()}
        for record in loaded_data
    ]

def change_values_to_zero_one(key, value):
    """Convert a boolean to the integer 0 or 1; pass anything else through.

    Args:
        key: Name of the attribute the value belongs to (not used here).
        value: The value to convert.

    Returns:
        ``int(value)`` if ``value`` is a ``bool``, otherwise ``value`` unchanged.
    """
    return int(value) if isinstance(value, bool) else value

def splitdata(loaded_data, attributeList):
    """One-hot encode the given attributes of every record, in place.

    For each attribute in ``attributeList`` and each distinct value it takes
    across ``loaded_data``, a key ``"<attribute> <value>"`` is added to every
    record. It is 1 when the record has that value and 0 otherwise. The
    original attribute key is then removed from the record.

    Distinct values are tracked by their string form, which is the form used
    in the generated key. The earlier check compared ``str(value)`` against
    the raw values, so non-string values were never recognised as duplicates
    and the value list grew with every record.

    Args:
        loaded_data: List of dictionaries; each must contain every attribute
            in ``attributeList``. The dictionaries are modified in place.
        attributeList: Names of the attributes to encode.

    Returns:
        The same ``loaded_data`` object that was passed in.

    Raises:
        KeyError: If a record lacks one of the attributes.
    """
    # attribute -> ordered set of value labels (first-seen order); an
    # OrderedDict with dummy values serves as the ordered set.
    labels_by_attribute = OrderedDict(
        (attribute, OrderedDict()) for attribute in attributeList
    )
    for eintrag in loaded_data:
        for attribute, labels in labels_by_attribute.items():
            labels[str(eintrag[attribute])] = None

    for eintrag in loaded_data:
        for attribute, labels in labels_by_attribute.items():
            for label in labels:
                eintrag[attribute + ' ' + label] = 0  # start every indicator at zero
            eintrag[attribute + ' ' + str(eintrag[attribute])] = 1
            eintrag.pop(attribute)
    return loaded_data

def pop_Attribute(loaded_data, attributeList):
    """Remove the listed attributes from every record, in place.

    Attributes that a record does not contain are skipped silently.

    Args:
        loaded_data: Iterable of dictionaries; modified in place.
        attributeList: Names of the keys to remove.

    Returns:
        The same ``loaded_data`` object that was passed in.
    """
    for record in loaded_data:
        for name in attributeList:
            if name in record:
                del record[name]
    return loaded_data
 
def main():
    """Run the feature-engineering steps from the input file to the output file.

    Steps, in order: load the JSON records, transliterate special characters,
    one-hot encode 'bundesland' and 'houseType', convert booleans to 0/1,
    drop the address-related attributes and '_class', and write the result.
    """
    input_filename = "trainingData_with_propertyAge.json"
    output_filename = "trainingData_with_city.json"

    # Load and normalise text values.
    records = load_data(input_filename)
    records = handle_special_chars_in_data(records)

    # Encode categorical attributes, then turn booleans into integers.
    records = splitdata(records, ['bundesland', 'houseType'])
    records = handle_datasplit(records)

    # Drop attributes that are not kept in the output.
    records = pop_Attribute(records, ['stadtteil', 'plz', 'strasse', '_class'])

    # Persist the final records.
    save_to_json(records, output_filename)


if __name__ == "__main__":
    main()

Lineare Regression

In [12]:
import json
import numpy as np
from sklearn.linear_model import LinearRegression

# Column order of the feature matrix. Training and test matrices are both
# built from this single list so their columns cannot drift apart.
FEATURE_COLUMNS = [
    'roomCount', 'propertyAge', 'livingSpace', 'hasBasement',
    'hasBalcony', 'parkingLotCount', 'hasGarden', 'hasElevator',
    'houseType apartment', 'houseType ground_floor', 'houseType half_basement',
    'houseType roof_storey', 'houseType maisonette', 'houseType raised_ground_floor',
    'houseType terraced_flat', 'houseType other', 'houseType penthouse',
    'houseType loft', 'bundesland Berlin', 'bundesland Bremen',
    'bundesland Nordrhein Westfalen', 'bundesland Hamburg', 'bundesland Sachsen Anhalt',
    'bundesland Niedersachsen', 'bundesland Baden Wuerttemberg', 'bundesland Rheinland Pfalz',
    'bundesland Hessen', 'bundesland Brandenburg', 'bundesland Sachsen',
    'bundesland Thueringen', 'bundesland Bayern', 'bundesland Mecklenburg Vorpommern',
    'bundesland Schleswig Holstein', 'bundesland Saarland', 'city_avr_rating',
    'city_avr_acc_population_change', 'city_avr_population_change_last_year',
    'city_avr_persons_per_km2', 'closest_city_distance',
]


def load_records(path):
    """Read a JSON file and return its parsed content.

    The file is opened as UTF-8 explicitly instead of relying on the
    platform default encoding.
    """
    with open(path, 'r', encoding='utf-8') as file:
        return json.load(file)


def build_feature_matrix(records):
    """Build the model input array from a list of record dictionaries.

    Args:
        records: Iterable of dictionaries, each containing every key listed
            in ``FEATURE_COLUMNS``.

    Returns:
        A NumPy array with one row per record and one column per entry of
        ``FEATURE_COLUMNS``, in that order.

    Raises:
        KeyError: If a record lacks one of the feature columns.
    """
    return np.array([[record[column] for column in FEATURE_COLUMNS]
                     for record in records])


# Read the training data.
training_data = load_records('short_data.json')

# Convert the training data into a feature matrix and a target vector.
X_train = build_feature_matrix(training_data)
y_train = np.array([data['rent'] for data in training_data])

# Create and fit the linear regression model.
regression = LinearRegression()
regression.fit(X_train, y_train)

# Read the test data.
test_data = load_records('test_data.json')

# Convert the test data into a feature matrix with the same column order.
X_test = build_feature_matrix(test_data)

# Predict for the test data.
predictions = regression.predict(X_test)

# Print the predictions, numbered from 1.
for i, prediction in enumerate(predictions, start=1):
    print(f"Vorhersage für Datenpunkt {i}: {prediction}")

ModuleNotFoundError: No module named 'pyspark'