In [64]:
## Reddit Crawler

## Config && praw 

In [65]:
import configparser
import praw
import pandas as pd
import datetime

In [66]:
config = configparser.ConfigParser()
config.read("config.properties")
client = config["reddit"]["reddit.client"]
client_secret = config["reddit"]["reddit.secret"]
user_agent = config["reddit"]["reddit.user_agent"]

red = praw.Reddit(client_id=client, client_secret=client_secret, user_agent=user_agent)

# hot_posts = red.subreddit("europe").hot(limit=3)
# for post in hot_posts:
#     print(post.id)
#     print(post.title)
#     print(post.url)
#     print(post.selftext)


## Aufgabe 1.1

In [67]:
def get_subreddit_posts(subreddit, stype, slimit):
    posts = []

    if slimit > 10:
        return "A maximum of 10 posts may be gathered: " + str(slimit)

    if stype == "hot":
        hposts = red.subreddit(subreddit).hot(limit=slimit)

        for hpost in hposts:
            id = hpost.id
            title = hpost.title
            text = hpost.selftext
            posts.append([id, title, text])
    elif stype == "new":
        nposts = red.subreddit(subreddit).new(limit=slimit)

        for npost in nposts:
            id = npost.id
            title = npost.title
            text = npost.selftext
            posts.append([id, title, text])
    else:
        return "Please enter a valid subreddit type such as hot, new"

    return pd.DataFrame(posts, columns=["subreddit_id", "subreddit_title", "subreddit_text"])

In [68]:
print(get_subreddit_posts("europe", "hot", 11))
print(get_subreddit_posts("europe", "w", 6))

A maximum of 10 posts may be gathered: 11
Please enter a valid subreddit type such as hot, new


In [69]:
print(get_subreddit_posts("europe", "hot", 3))

  subreddit_id                     subreddit_title  \
0      1bkysju  War in Ukraine Megathread LVI (57)   
1      1c63g6b     Croatian parliamentary election   
2      1c623ia              A protester in Tbilisi   

                                      subreddit_text  
0  \nThis megathread is meant for discussion of t...  
1  Today (April 17th) citizens of Croatia go to p...  
2                                                     


## Aufgabe 1.2

In [70]:
bundesliga_df = get_subreddit_posts("Bundesliga", "hot", 10)
bundesliga_df.to_csv("bundesliga.csv", index=False)

## Aufgabe 2
## Aufgabe 2.1

In [71]:
def get_subreddit_posts_comments(subreddit, stype, slimit):
    all_comments = []
    posts_df = get_subreddit_posts(subreddit, stype, slimit)

    for _, post in posts_df.iterrows():
        post_id = post["subreddit_id"]

        post_obj = red.submission(id=post_id)
        post_obj.comments.replace_more(limit=None)
        comments = post_obj.comments.list()

        for comment in comments:
            subreddit_id = post["subreddit_id"]
            s_date = post_obj.created_utc
            comment_id = comment.id
            c_text = comment.body
            c_upvote = comment.score
            c_author = comment.author
            c_date = comment.created_utc

            all_comments.append([subreddit_id, s_date, comment_id, c_text, c_upvote, c_author, c_date])

    df = pd.DataFrame(all_comments,
                      columns=["subreddit_id", "s_date", "comment_id", "c_text", "c_upvote", "c_author", "c_date"])

    return df

In [72]:
print(get_subreddit_posts_comments("Nachrichten", "hot", 10))

    subreddit_id        s_date comment_id  \
0        1977olb  1.705322e+09    kiqjz7n   
1        1c65xfk  1.713348e+09    kzyq8tl   
2        1c65xfk  1.713348e+09    kzz5fgv   
3        1c65xfk  1.713348e+09    kzytjdz   
4        1c65xfk  1.713348e+09    kzyr4mm   
..           ...           ...        ...   
174      1c3qxb5  1.713092e+09    kzop7or   
175      1c3qxb5  1.713092e+09    kzjfdzn   
176      1c3qxb5  1.713092e+09    kznhdmx   
177      1c3qxb5  1.713092e+09    kznm18i   
178      1c3qxb5  1.713092e+09    kznn5g1   

                                                c_text  c_upvote  \
0                                            Gerne PN!         1   
1    > **Behörden haben den ehemaligen griechischen...         9   
2    Ach in welcher schönen Zeit wir leben wo die g...         3   
3    Schade, dass das nich auch schon ginger also e...        -5   
4                                            [removed]       -30   
..                                                 

## Aufgabe 2.2

In [73]:
def convert_utc_to_datetime(utc_date):
    return datetime.date.fromtimestamp(utc_date)

## Aufgabe 2.3

In [74]:
df_datetime = get_subreddit_posts_comments("Nachrichten", "hot", 10)

df_datetime["s_date"] = df_datetime["s_date"].apply(convert_utc_to_datetime)
df_datetime["c_date"] = df_datetime["c_date"].apply(convert_utc_to_datetime)

print(df_datetime)

    subreddit_id      s_date comment_id  \
0        1977olb  2024-01-15    kiqjz7n   
1        1c65xfk  2024-04-17    kzyq8tl   
2        1c65xfk  2024-04-17    kzz5fgv   
3        1c65xfk  2024-04-17    kzytjdz   
4        1c65xfk  2024-04-17    kzyr4mm   
..           ...         ...        ...   
174      1c3qxb5  2024-04-14    kzop7or   
175      1c3qxb5  2024-04-14    kzjfdzn   
176      1c3qxb5  2024-04-14    kznhdmx   
177      1c3qxb5  2024-04-14    kznm18i   
178      1c3qxb5  2024-04-14    kznn5g1   

                                                c_text  c_upvote  \
0                                            Gerne PN!         1   
1    > **Behörden haben den ehemaligen griechischen...         7   
2    Ach in welcher schönen Zeit wir leben wo die g...         3   
3    Schade, dass das nich auch schon ginger also e...        -4   
4                                            [removed]       -30   
..                                                 ...       ...   
174    

## Aufgabe 2.4

In [75]:
df_datetime.to_csv("df_datetime.csv", index=True)

## Aufgabe 3

In [76]:
def calc_comments_length(df, col_name):
    df['n_letters'] = df[col_name].apply(len)
    return df


df = pd.read_csv("df_datetime.csv")

extended_df = calc_comments_length(df, 'c_text')
print(extended_df)

     Unnamed: 0 subreddit_id      s_date comment_id  \
0             0      1977olb  2024-01-15    kiqjz7n   
1             1      1c65xfk  2024-04-17    kzyq8tl   
2             2      1c65xfk  2024-04-17    kzz5fgv   
3             3      1c65xfk  2024-04-17    kzytjdz   
4             4      1c65xfk  2024-04-17    kzyr4mm   
..          ...          ...         ...        ...   
174         174      1c3qxb5  2024-04-14    kzop7or   
175         175      1c3qxb5  2024-04-14    kzjfdzn   
176         176      1c3qxb5  2024-04-14    kznhdmx   
177         177      1c3qxb5  2024-04-14    kznm18i   
178         178      1c3qxb5  2024-04-14    kznn5g1   

                                                c_text  c_upvote  \
0                                            Gerne PN!         1   
1    > **Behörden haben den ehemaligen griechischen...         7   
2    Ach in welcher schönen Zeit wir leben wo die g...         3   
3    Schade, dass das nich auch schon ginger also e...        -4   

## Aufgabe 4

In [77]:
def calculate_comment_activity(df, post_date_col, comment_date_col):
    df[post_date_col] = pd.to_datetime(df[post_date_col])
    df[comment_date_col] = pd.to_datetime(df[comment_date_col])
    
    df['days_since_post'] = (df[comment_date_col] - df[post_date_col]).dt.days

    comment_activity = df['days_since_post'].value_counts().sort_index()
    
    output_df = pd.DataFrame({'counts': comment_activity})
    output_df.index.name = 'days'

    return output_df

comment_activity = calculate_comment_activity(df, 's_date', 'c_date')
print(comment_activity)

      counts
days        
0         67
1        104
2          7
5          1
