## Reddit Crawler

## Configuration and PRAW client

In [167]:
import configparser
import praw
import pandas as pd
import datetime

In [168]:
# Read the Reddit API credentials from the local properties file and build the
# PRAW client (`red`) that the rest of the notebook uses.
config = configparser.ConfigParser()

# ConfigParser.read() skips files it cannot open and returns the list of files
# it actually parsed, so an empty result means the credentials are missing.
# Fail here with a clear message instead of a later KeyError on "reddit".
if not config.read("config.properties"):
    raise FileNotFoundError("Could not read configuration file: config.properties")

client = config["reddit"]["reddit.client"]
client_secret = config["reddit"]["reddit.secret"]
user_agent = config["reddit"]["reddit.user_agent"]

red = praw.Reddit(client_id=client, client_secret=client_secret, user_agent=user_agent)

# hot_posts = red.subreddit("europe").hot(limit=3)
# for post in hot_posts:
#     print(post.id)
#     print(post.title)
#     print(post.url)
#     print(post.selftext)


## Aufgabe 1.1

In [169]:
def get_subreddit_posts(subreddit, stype, slimit):
    """Fetch up to ten posts from a subreddit listing as a DataFrame.

    Parameters
    ----------
    subreddit : str
        Subreddit name, passed to ``red.subreddit``.
    stype : str
        Listing to read: ``"hot"`` or ``"new"``.
    slimit : int
        Number of posts to request; values above 10 are rejected.

    Returns
    -------
    pandas.DataFrame or str
        One row per post with the columns ``subreddit_id``,
        ``subreddit_title`` and ``subreddit_text``. When ``slimit`` is greater
        than 10 or ``stype`` is not a supported listing, an error message
        string is returned instead of a frame.
    """
    if slimit > 10:
        return "A maximum of 10 posts may be gathered: " + str(slimit)

    # Validate the listing name up front so an unknown value never reaches the
    # Reddit client.
    if stype not in ("hot", "new"):
        return "Please enter a valid subreddit type such as hot, new"

    # Both listings are called the same way, so pick the method by name rather
    # than duplicating the collection loop per listing.
    listing = getattr(red.subreddit(subreddit), stype)
    posts = [[post.id, post.title, post.selftext] for post in listing(limit=slimit)]

    return pd.DataFrame(posts, columns=["subreddit_id", "subreddit_title", "subreddit_text"])

In [170]:
# Exercise both rejection paths: a limit above the maximum, then an unknown listing type.
print(get_subreddit_posts(subreddit="europe", stype="hot", slimit=11))
print(get_subreddit_posts(subreddit="europe", stype="w", slimit=6))

A maximum of 10 posts may be gathered: 11
Please enter a valid subreddit type such as hot, new


In [171]:
# Show the three current "hot" posts of r/europe.
print(get_subreddit_posts(subreddit="europe", stype="hot", slimit=3))

  subreddit_id                                    subreddit_title  \
0      1bkysju                 War in Ukraine Megathread LVI (57)   
1      1as6y20    Moratorium on posts related to Israel-Palestine   
2      1c5bk6k  Fire engulfs Copenhagen’s Old Stock Exchange i...   

                                      subreddit_text  
0  \nThis megathread is meant for discussion of t...  
1  r/europe is the prime subreddit to share and d...  
2                                                     


## Aufgabe 1.2

In [172]:
# Fetch ten "hot" posts from r/Bundesliga and write them to CSV without the row index.
bundesliga_df = get_subreddit_posts(subreddit="Bundesliga", stype="hot", slimit=10)
bundesliga_df.to_csv(path_or_buf="bundesliga.csv", index=False)

## Aufgabe 2
## Aufgabe 2.1

In [173]:
def get_subreddit_posts_comments(subreddit, stype, slimit):
    """Collect all comments of the posts returned by ``get_subreddit_posts``.

    Parameters
    ----------
    subreddit : str
        Subreddit name, forwarded to ``get_subreddit_posts``.
    stype : str
        Listing type, forwarded to ``get_subreddit_posts``.
    slimit : int
        Number of posts to inspect, forwarded to ``get_subreddit_posts``.

    Returns
    -------
    pandas.DataFrame or str
        One row per comment with the columns ``subreddit_id`` (id of the post
        the comment belongs to), ``s_date`` (post ``created_utc``),
        ``comment_id``, ``c_text``, ``c_upvote`` (comment score), ``c_author``
        (author name, or ``None`` when the comment has no author) and
        ``c_date`` (comment ``created_utc``). If ``get_subreddit_posts``
        rejects the arguments, its error message string is returned unchanged.
    """
    posts_df = get_subreddit_posts(subreddit, stype, slimit)

    # get_subreddit_posts reports invalid arguments as a plain message string;
    # pass it through instead of failing on a DataFrame-only method below.
    if isinstance(posts_df, str):
        return posts_df

    all_comments = []

    for post_id in posts_df["subreddit_id"]:
        post_obj = red.submission(id=post_id)
        # limit=None asks PRAW to resolve every "more comments" placeholder so
        # that .list() yields the complete comment tree.
        post_obj.comments.replace_more(limit=None)
        # The post date is the same for all of its comments: read it once.
        s_date = post_obj.created_utc

        for comment in post_obj.comments.list():
            author = comment.author
            # Store the author's name (str() of a PRAW Redditor is its user
            # name) rather than the live API object; a missing author stays None.
            c_author = str(author) if author is not None else None

            all_comments.append(
                [post_id, s_date, comment.id, comment.body, comment.score, c_author, comment.created_utc]
            )

    return pd.DataFrame(
        all_comments,
        columns=["subreddit_id", "s_date", "comment_id", "c_text", "c_upvote", "c_author", "c_date"],
    )

In [174]:
# Print every comment of the ten current "hot" posts in r/Nachrichten.
print(get_subreddit_posts_comments(subreddit="Nachrichten", stype="hot", slimit=10))

    subreddit_id        s_date comment_id  \
0        1977olb  1.705322e+09    kiqjz7n   
1        1c4yquf  1.713218e+09    kzsbfrv   
2        1c4qcrg  1.713198e+09    kzpp65o   
3        1c4qcrg  1.713198e+09    kzpde3u   
4        1c4qcrg  1.713198e+09    kzpowq8   
..           ...           ...        ...   
138      1c3vrv5  1.713107e+09    kznc4s8   
139      1c3vrv5  1.713107e+09    kzncx8b   
140      1c3vrv5  1.713107e+09    kzoed5u   
141      1c3vrv5  1.713107e+09    kzoj672   
142      1c2k6gt  1.712956e+09    kzcgb01   

                                                c_text  c_upvote  \
0                                            Gerne PN!         1   
1    Ich dachte alle fahren in der 10.\n\nDas war n...        54   
2    Mehrheit für mehr Geld für Klimaschutz, Bundes...        27   
3                                                  gut         8   
4    Oder auch: Mehrheit glaubt, dass mehr Geld die...         2   
..                                                 

## Aufgabe 2.2

In [175]:
def convert_utc_to_datetime(utc_date):
    """Convert a Unix timestamp to its calendar date in UTC.

    Parameters
    ----------
    utc_date : int or float
        Seconds since the Unix epoch (e.g. a ``created_utc`` value).

    Returns
    -------
    datetime.date
        The UTC calendar date of the timestamp.
    """
    # date.fromtimestamp() interprets the timestamp in the machine's local
    # timezone, so the same value can land on different days depending on where
    # the notebook runs. Convert explicitly in UTC, then drop the time part.
    return datetime.datetime.fromtimestamp(utc_date, tz=datetime.timezone.utc).date()

## Aufgabe 2.3

In [176]:
# Re-crawl the comments, then replace both epoch columns with calendar dates
# via convert_utc_to_datetime before printing the frame.
df_datetime = get_subreddit_posts_comments("Nachrichten", "hot", 10).assign(
    s_date=lambda frame: frame["s_date"].map(convert_utc_to_datetime),
    c_date=lambda frame: frame["c_date"].map(convert_utc_to_datetime),
)

print(df_datetime)

    subreddit_id      s_date comment_id  \
0        1977olb  2024-01-15    kiqjz7n   
1        1c4yquf  2024-04-15    kzsbfrv   
2        1c4qcrg  2024-04-15    kzpp65o   
3        1c4qcrg  2024-04-15    kzpde3u   
4        1c4qcrg  2024-04-15    kzpowq8   
..           ...         ...        ...   
138      1c3vrv5  2024-04-14    kznc4s8   
139      1c3vrv5  2024-04-14    kzncx8b   
140      1c3vrv5  2024-04-14    kzoed5u   
141      1c3vrv5  2024-04-14    kzoj672   
142      1c2k6gt  2024-04-12    kzcgb01   

                                                c_text  c_upvote  \
0                                            Gerne PN!         1   
1    Ich dachte alle fahren in der 10.\n\nDas war n...        53   
2    Mehrheit für mehr Geld für Klimaschutz, Bundes...        24   
3                                                  gut         8   
4    Oder auch: Mehrheit glaubt, dass mehr Geld die...         3   
..                                                 ...       ...   
138  Ic

## Aufgabe 2.4

In [177]:
# Persist the date-converted comments; the row index is written as the first CSV column.
df_datetime.to_csv(path_or_buf="df_datetime.csv", index=True)

## Aufgabe 3

## Aufgabe 4