In [1]:
import os
import gzip
import json
import pandas as pd

In [2]:
def load_json(file_path):
    """Read a plain JSON news dump into a DataFrame.

    Parameters
    ----------
    file_path : str or path-like
        Path to a JSON document whose top-level object has a ``'news'`` key.

    Returns
    -------
    pandas.DataFrame
        Frame built with ``DataFrame.from_records`` from the ``'news'`` value.

    Raises
    ------
    KeyError
        If the document has no ``'news'`` key.
    """
    # Decode explicitly as UTF-8: relying on the platform default (for
    # example cp1252 on Windows) garbles non-ASCII text such as umlauts.
    with open(file_path, 'r', encoding='utf-8') as json_file:
        data = json.load(json_file)['news']
    return pd.DataFrame.from_records(data)

def load_gzip(file_path):
    """Read a gzip-compressed JSON news dump into a DataFrame.

    Parameters
    ----------
    file_path : str or path-like
        Path to a gzipped JSON document whose top-level object has a
        ``'news'`` key.

    Returns
    -------
    pandas.DataFrame
        Frame built with ``DataFrame.from_records`` from the ``'news'`` value.

    Raises
    ------
    KeyError
        If the document has no ``'news'`` key.
    """
    # 'rt' makes gzip hand back decoded text; pin the codec to UTF-8 so the
    # result does not depend on the platform's default encoding.
    with gzip.open(file_path, 'rt', encoding='utf-8') as gzipped_json_file:
        data = json.load(gzipped_json_file)['news']
    return pd.DataFrame.from_records(data)

In [3]:
# Loads every JSON or gzipped JSON file in the specified folder and concatenates them into one DataFrame.
def data_to_dataframe(folder_path='News'):
    """Load all ``.json`` / ``.json.gz`` files of a folder into one DataFrame.

    Parameters
    ----------
    folder_path : str, default 'News'
        Directory to scan (not recursive). Entries with any other suffix
        are skipped.

    Returns
    -------
    pandas.DataFrame
        All per-file frames concatenated with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If the folder contains no ``.json`` or ``.json.gz`` file.
    """
    list_of_data_frames = []

    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order, and therefore any order-dependent step applied afterwards,
    # reproducible between runs and machines.
    for filename in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, filename)

        if filename.endswith('.json'):
            list_of_data_frames.append(load_json(file_path))

        elif filename.endswith('.json.gz'):
            list_of_data_frames.append(load_gzip(file_path))

    # pd.concat([]) fails with an opaque "No objects to concatenate";
    # keep the exception type but say what is actually wrong.
    if not list_of_data_frames:
        raise ValueError(
            f"No .json or .json.gz files found in folder {folder_path!r}"
        )

    print('Process finished')
    return pd.concat(list_of_data_frames, ignore_index=True)

In [4]:
# Build the working frame from every news file under the default 'News' folder.
df = data_to_dataframe(folder_path='News')

Process finished


In [5]:
# Keep only the last-seen row per sophoraId.
# NOTE(review): the result lands in df_dropped, while the following cells keep
# working on df - confirm which frame is meant to be used downstream.
df_dropped = df.drop_duplicates(subset=['sophoraId'], keep='last')

In [6]:
# Show every column when a DataFrame is rendered (no "..." truncation).
pd.options.display.max_columns = None

In [7]:
# Replace the raw 'date' column with its parsed datetime form.
df['date'] = df['date'].pipe(pd.to_datetime)

In [8]:
# Columns that are not needed for the analysis below.
columns_to_remove = ['updateCheckUrl', 'details', 'detailsweb', 'shareURL', 'streams', 'comments']
# errors='ignore' keeps this cell idempotent: df is rebound here, so running
# the cell a second time would otherwise raise KeyError for columns that the
# first run already removed.
df = df.drop(columns=columns_to_remove, errors='ignore')

Unnamed: 0,sophoraId,externalId,title,date,teaserImage,tags,tracking,topline,firstSentence,brandingImage,geotags,regionId,regionIds,type,breakingNews,alttext,copyright,ressort


In [32]:
# Preview the first ten rows.
df.head(n=10)

Unnamed: 0,sophoraId,externalId,title,date,teaserImage,tags,tracking,topline,firstSentence,brandingImage,geotags,regionId,regionIds,type,breakingNews,alttext,copyright,ressort
0,ndr-36-jaehriger-schwimmer-tot-aus-kiessee-bei...,tagesschau_fm-story-id-4ed766b3-779f-44f7-9faa...,36-jähriger Schwimmer tot aus Kiessee bei Frie...,2023-10-22 19:52:00+02:00,{'title': 'Mit Blaulicht und Martinshorn soll ...,[{'tag': 'Niedersachsen'}],[{'sid': 'app.inland.regional.niedersachsen.nd...,Niedersachsen,"Taucher fanden den Mann rund eine Stunde, nach...","{'title': 'Logo NDR', 'copyright': 'tagesschau...",[],9.0,[9],story,False,,,
1,wettervorhersage-europa-100,a13811c2-a633-446d-8c44-f854dbdb9233,Wetterlage und Temperaturen,2023-10-22 19:46:52.550000+02:00,"{'copyright': 'ARD', 'alttext': 'Europa Wetter...",[{'tag': 'Wetter'}],[{'sid': 'app.wetter.europa-welt.wettervorhers...,Wettervorhersage Europa,,,[],0.0,[],story,False,,,
2,hr-buergermeisterwahlen-ein-amtsinhaber-muss-g...,tagesschau_fm-story-id-902744f0-b542-4f4c-8098...,Bürgermeisterwahlen: Ein Amtsinhaber muss gehe...,2023-10-22 19:44:54.726000+02:00,"{'copyright': 'picture-alliance/dpa (Archiv)',...",[{'tag': 'Hessen'}],[{'sid': 'app.inland.regional.hessen.hr-buerge...,Hessen,In vier hessischen Kommunen fanden am Sonntag ...,"{'title': 'Logo HR', 'copyright': 'tagesschau,...",[],7.0,[7],story,False,,,
3,sr-schweres-los-fuer-gabriel-clemens-bei-darts...,tagesschau_fm-story-gabriel_clemens_bei_darts_...,Schweres Los für Gabriel Clemens bei Darts-EM,2023-10-22 19:34:00+02:00,"{'title': 'Gabriel Clemens', 'copyright': 'IMA...",[{'tag': 'Saarland'}],[{'sid': 'app.inland.regional.saarland.sr-schw...,Saarland,Ab nächsten Donnerstag tritt der Honzrather Da...,"{'title': 'Logo SR', 'copyright': 'tagesschau,...",[],12.0,[12],story,False,,,
4,ndr-verbot-pro-palaestinensischer-demos-in-ham...,tagesschau_fm-story-id-da32f992-bc30-4f21-ba91...,Verbot pro-palästinensischer Demos in Hamburg ...,2023-10-22 19:30:00+02:00,{'title': 'Ein junger Mann trägt bei eine palä...,[{'tag': 'Hamburg'}],[{'sid': 'app.inland.regional.hamburg.ndr-verb...,Hamburg,Pro-palästinensische Kundgebungen sind nun in ...,"{'title': 'Logo NDR', 'copyright': 'tagesschau...",[],6.0,[6],story,False,,,
5,video-1263800,029950e1-8537-4217-9423-71f76d192795,Aktueller Wolkenfilm,2023-10-22 19:16:08.914000+02:00,"{'title': 'Sendungsbild', 'copyright': 'ARD-ak...",[],"[{'sid': 'app.multimedia.video.video-1263800',...",,,,,,,video,,Sendungsbild,tagesschau,
6,liveblog-israel-sonntag-102,649e5988-da34-43c4-8c22-71da5a1fb805,++ Netanyahu warnt Hisbollah ++,2023-10-22 19:14:25.652000+02:00,"{'alttext': 'Benjamin Netanyahu', 'imageVarian...","[{'tag': 'Israel'}, {'tag': 'Liveblog'}]",[{'sid': 'app.newsticker.liveblog-israel-sonnt...,Nach Großangriff auf Israel,Die aktuellen Entwicklungen im Nahen Osten in ...,,[],0.0,[],story,False,,,
7,hr--promis-auf-der-buchmesse--tschechien-ist-b...,tagesschau_fm-story-id-55097a32-3306-48ef-b38b...,+++ Rund 215.000 Besucher kommen nach Frankfur...,2023-10-22 19:11:36.030000+02:00,{'title': 'Auch vor Halle 3 war es am Samstag ...,[{'tag': 'Hessen'}],[{'sid': 'app.inland.regional.hessen.hr--promi...,Hessen,"Die Frankfurter Buchmesse 2023. Service, Progr...","{'title': 'Logo HR', 'copyright': 'tagesschau,...",[],7.0,[7],story,False,,,
8,rbb-alba-siegt-trotz-katastrophalem-start-gege...,tagesschau_fm-story-rbb_basketball-bundesliga-...,Alba siegt trotz katastrophalem Start gegen Ve...,2023-10-22 19:10:00+02:00,"{'title': 'Alba Berlin', 'copyright': 'imago i...",[{'tag': 'Berlin'}],[{'sid': 'app.inland.regional.berlin.rbb-alba-...,Berlin,,"{'title': 'Logo rbb', 'copyright': 'tagesschau...",[],3.0,[3],story,False,,,
9,parlamentswahl-schweiz-rechtsruck-100,909e7513-70e6-4605-b488-cb39b9d33425,Schweiz rückt weiter nach rechts,2023-10-22 19:07:37.212000+02:00,{'alttext': 'Marcel Dettling gibt der ARD ein ...,"[{'tag': 'Schweiz'}, {'tag': 'Parlamentswahl'}]",[{'sid': 'app.ausland.europa.parlamentswahl-sc...,Parlamentswahl,Die SVP ist in der Schweiz seit vielen Jahren ...,,[],0.0,[],story,False,,,ausland


In [28]:
# Lift pandas' display limits for both rows and columns.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

In [30]:
# Peek at the first three raw 'teaserImage' cells.
column_data = df.loc[:, 'teaserImage']
column_data.head(n=3)

0    {'title': 'Mit Blaulicht und Martinshorn soll ...
1    {'copyright': 'ARD', 'alttext': 'Europa Wetter...
2    {'copyright': 'picture-alliance/dpa (Archiv)',...
Name: teaserImage, dtype: object

In [31]:
# Peek at the first three raw 'tracking' cells.
column_data = df.loc[:, 'tracking']
column_data.head(n=3)

0    [{'sid': 'app.inland.regional.niedersachsen.nd...
1    [{'sid': 'app.wetter.europa-welt.wettervorhers...
2    [{'sid': 'app.inland.regional.hessen.hr-buerge...
Name: tracking, dtype: object

In [33]:
def list_tags(line):
    """Flatten one ``tags`` cell into a plain list of tag values.

    Parameters
    ----------
    line : iterable, None or NaN
        Normally a list of dicts that each carry a ``'tag'`` key, e.g.
        ``[{'tag': 'Israel'}, {'tag': 'Liveblog'}]``.

    Returns
    -------
    list
        The ``'tag'`` value of every dict, in order. Items that are not
        dicts are passed through unchanged, and a missing cell (None or
        NaN) yields an empty list.

    Raises
    ------
    KeyError
        If a dict item has no ``'tag'`` key.
    """
    # A missing cell shows up as None or float NaN; NaN is the only float
    # that is not equal to itself.
    if line is None or (isinstance(line, float) and line != line):
        return []
    # Passing non-dict items through makes the function idempotent, so the
    # cell that overwrites df['tags'] with its own output can be re-run.
    return [x['tag'] if isinstance(x, dict) else x for x in line]

In [45]:
# Replace each list of {'tag': ...} dicts with the bare list of tag values.
df['tags'] = df['tags'].map(list_tags)

In [47]:
# Check the flattened 'tags' column on the first three rows.
df.head(n=3)

Unnamed: 0,sophoraId,externalId,title,date,teaserImage,tags,tracking,topline,firstSentence,brandingImage,geotags,regionId,regionIds,type,breakingNews,alttext,copyright,ressort
0,ndr-36-jaehriger-schwimmer-tot-aus-kiessee-bei...,tagesschau_fm-story-id-4ed766b3-779f-44f7-9faa...,36-jähriger Schwimmer tot aus Kiessee bei Frie...,2023-10-22 19:52:00+02:00,{'title': 'Mit Blaulicht und Martinshorn soll ...,[Niedersachsen],[{'sid': 'app.inland.regional.niedersachsen.nd...,Niedersachsen,"Taucher fanden den Mann rund eine Stunde, nach...","{'title': 'Logo NDR', 'copyright': 'tagesschau...",[],9.0,[9],story,False,,,
1,wettervorhersage-europa-100,a13811c2-a633-446d-8c44-f854dbdb9233,Wetterlage und Temperaturen,2023-10-22 19:46:52.550000+02:00,"{'copyright': 'ARD', 'alttext': 'Europa Wetter...",[Wetter],[{'sid': 'app.wetter.europa-welt.wettervorhers...,Wettervorhersage Europa,,,[],0.0,[],story,False,,,
2,hr-buergermeisterwahlen-ein-amtsinhaber-muss-g...,tagesschau_fm-story-id-902744f0-b542-4f4c-8098...,Bürgermeisterwahlen: Ein Amtsinhaber muss gehe...,2023-10-22 19:44:54.726000+02:00,"{'copyright': 'picture-alliance/dpa (Archiv)',...",[Hessen],[{'sid': 'app.inland.regional.hessen.hr-buerge...,Hessen,In vier hessischen Kommunen fanden am Sonntag ...,"{'title': 'Logo HR', 'copyright': 'tagesschau,...",[],7.0,[7],story,False,,,


# Trying to join the JSON and gzip loaders together, but failing :D


In [5]:
def load_data(file_path):
    """Read a plain or gzip-compressed JSON news dump into a DataFrame.

    The loader is chosen from the file name: ``.json.gz`` is read through
    ``gzip.open``, ``.json`` through the built-in ``open``.

    Parameters
    ----------
    file_path : str
        Path to the file; its top-level JSON object must have a ``'news'``
        key.

    Returns
    -------
    pandas.DataFrame
        Frame built with ``DataFrame.from_records`` from the ``'news'`` value.

    Raises
    ------
    ValueError
        If the name ends in neither ``.json`` nor ``.json.gz``.
    KeyError
        If the document has no ``'news'`` key.
    """
    # Pick the opener *before* touching the file. The earlier version opened
    # the path in text mode and handed that handle to gzip.open, which needs
    # a path or a binary stream, so gzipped files could never be read.
    if file_path.endswith('.json.gz'):
        opener = gzip.open
    elif file_path.endswith('.json'):
        opener = open
    else:
        # ValueError is still an Exception, so existing handlers keep working.
        raise ValueError("Filetype not recognized")

    # Both openers accept text mode plus an explicit codec; UTF-8 avoids
    # depending on the platform's default encoding.
    with opener(file_path, 'rt', encoding='utf-8') as file:
        json_data = json.load(file)

    return pd.DataFrame.from_records(json_data['news'])

In [6]:
# Loads every JSON or gzipped JSON file in the specified folder and concatenates them into one DataFrame.
def data_to_dataframe(folder_path='News'):
    """Load all ``.json`` / ``.json.gz`` files of a folder into one DataFrame.

    Each matching file is read with ``load_data``; entries with any other
    suffix are skipped.

    Parameters
    ----------
    folder_path : str, default 'News'
        Directory to scan (not recursive).

    Returns
    -------
    pandas.DataFrame
        All per-file frames concatenated with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If the folder contains no ``.json`` or ``.json.gz`` file.
    """
    list_of_data_frames = []

    # Sorted because os.listdir returns entries in arbitrary order, which
    # would make the row order differ between runs and machines.
    for filename in sorted(os.listdir(folder_path)):
        # Skip anything that is not a data file so that stray entries in
        # the folder do not reach the loader.
        if not filename.endswith(('.json', '.json.gz')):
            continue

        file_path = os.path.join(folder_path, filename)
        # The loaded frame has to be collected; previously the result was
        # discarded, leaving the list empty and making pd.concat fail.
        list_of_data_frames.append(load_data(file_path))

    # pd.concat([]) fails with an opaque "No objects to concatenate";
    # keep the exception type but say what is actually wrong.
    if not list_of_data_frames:
        raise ValueError(
            f"No .json or .json.gz files found in folder {folder_path!r}"
        )

    print('Process finished')
    return pd.concat(list_of_data_frames, ignore_index=True)