In [1]:
import pandas as pd
import datetime
import os

In [2]:
def _clean_posts(posts, subreddit_name, cutoff):
    """Return the rows of ``posts`` that count towards ``subreddit_name``.

    A row is kept when its author is not ``'[deleted]'``, its lowercased
    ``subreddit`` equals ``subreddit_name`` and, if the frame has a
    ``utc_datetime_str`` column, that value parses to a timestamp strictly
    before ``cutoff``.

    Parameters
    ----------
    posts : pandas.DataFrame
        Must contain ``author``, ``subreddit`` and ``subreddit_id`` columns.
    subreddit_name : str
        Lowercase subreddit name to match against.
    cutoff : datetime.datetime
        Exclusive upper bound for ``utc_datetime_str``.

    Returns
    -------
    pandas.DataFrame
        Columns ``author``, ``subreddit`` (lowercased) and ``subreddit_id``.
    """
    keep = posts['author'] != '[deleted]'

    # The date filter is decided per frame, so one file lacking the column
    # neither raises KeyError nor disables the filter for the other file.
    if 'utc_datetime_str' in posts.columns:
        timestamps = pd.to_datetime(posts['utc_datetime_str'], errors='coerce')
        # Unparseable values become NaT, and NaT < cutoff is False, so they
        # are dropped here without a separate notna() test.
        keep = keep & (timestamps < cutoff)

    # Lowercase only real strings; a missing subreddit (NaN) is left as-is
    # and simply fails the equality test below instead of raising.
    lowered = posts['subreddit'].map(
        lambda value: value.lower() if isinstance(value, str) else value
    )
    keep = keep & (lowered == subreddit_name)

    cleaned = posts.assign(subreddit=lowered)
    return cleaned.loc[keep, ['author', 'subreddit', 'subreddit_id']]


def process_two(s_1, s_2, cutoff=None):
    """Summarise posts per author for a subreddit that has two input files.

    Reads the comments CSV ``s_1`` and the submissions CSV ``s_2``, filters
    both with the same rules, counts the remaining rows per author and writes
    ``<prefix>_posts.csv``, where ``<prefix>`` is ``s_1`` without its last
    ``_``-separated part. The output has the columns ``author``, ``n_posts``,
    ``subreddit`` and ``subreddit_id``.

    Parameters
    ----------
    s_1 : str
        Path of the comments CSV, named ``<subreddit>_comments.csv``.
    s_2 : str
        Path of the submissions CSV for the same subreddit.
    cutoff : datetime.datetime, optional
        Only rows dated strictly before this are counted. Defaults to
        ``datetime.datetime(2020, 11, 4)``.

    Returns
    -------
    None
        Nothing is written when both input files contain no rows.
    """
    if cutoff is None:
        cutoff = datetime.datetime(2020, 11, 4)

    file_comment = pd.read_csv(s_1, encoding='ISO-8859-1')
    file_submission = pd.read_csv(s_2, encoding='ISO-8859-1')

    if len(file_comment) == 0 and len(file_submission) == 0:
        return

    # Use the full prefix (not just the text before the first '_') so that
    # subreddit names containing underscores are matched correctly.
    prefix = '_'.join(s_1.split('_')[:-1])
    subreddit_name = os.path.basename(prefix).lower()

    def first_mode(values):
        # Always reduce to one scalar: on ties take the first mode, and use
        # None when the group has no non-null value at all.
        modes = values.mode()
        return modes.iloc[0] if len(modes) else None

    cleaned = [
        _clean_posts(frame, subreddit_name, cutoff)
        for frame in (file_comment, file_submission)
    ]
    cleaned = [frame for frame in cleaned if not frame.empty]

    file_new = None
    if cleaned:
        # Rows without an author cannot be attributed to anyone.
        file_new = pd.concat(cleaned, ignore_index=True).dropna(subset=['author'])

    if file_new is None or file_new.empty:
        # Nothing survived the filters: still emit a header-only file.
        file_output = pd.DataFrame(
            columns=['author', 'n_posts', 'subreddit', 'subreddit_id']
        )
    else:
        file_output = (
            file_new.groupby('author')
            .agg(
                n_posts=('subreddit', 'count'),
                subreddit=('subreddit', first_mode),
                subreddit_id=('subreddit_id', first_mode),
            )
            .reset_index()
        )

    file_output.to_csv(prefix + '_posts.csv', index=False)

In [3]:
def process_one(s_1, cutoff=None):
    """Summarise posts per author for a subreddit that has a single input file.

    Reads the CSV ``s_1``, keeps rows whose author is not ``'[deleted]'``,
    whose ``subreddit`` matches the subreddit name taken from the file name
    (compared case-insensitively) and, when a ``utc_datetime_str`` column is
    present, whose timestamp is strictly before ``cutoff``. The per-author
    row counts are written to ``<prefix>_posts.csv``, where ``<prefix>`` is
    ``s_1`` without its last ``_``-separated part. The output has the columns
    ``author``, ``n_posts``, ``subreddit`` (lowercased) and ``subreddit_id``.

    Parameters
    ----------
    s_1 : str
        Path of the input CSV, named ``<subreddit>_comments.csv`` or
        ``<subreddit>_submissions.csv``.
    cutoff : datetime.datetime, optional
        Only rows dated strictly before this are counted. Defaults to
        ``datetime.datetime(2020, 11, 4)``.

    Returns
    -------
    None
        Nothing is written when the input file contains no rows.
    """
    if cutoff is None:
        cutoff = datetime.datetime(2020, 11, 4)

    posts = pd.read_csv(s_1, encoding='ISO-8859-1')
    if len(posts) == 0:
        return

    # Use the full prefix (not just the text before the first '_') so that
    # subreddit names containing underscores are matched correctly.
    prefix = '_'.join(s_1.split('_')[:-1])
    subreddit_name = os.path.basename(prefix).lower()

    def first_mode(values):
        # Always reduce to one scalar: on ties take the first mode, and use
        # None when the group has no non-null value at all.
        modes = values.mode()
        return modes.iloc[0] if len(modes) else None

    keep = posts['author'] != '[deleted]'
    if 'utc_datetime_str' in posts.columns:
        timestamps = pd.to_datetime(posts['utc_datetime_str'], errors='coerce')
        # Unparseable values become NaT, and NaT < cutoff is False.
        keep = keep & (timestamps < cutoff)

    # Compare case-insensitively, as the two-file variant of this routine
    # does; non-string values (NaN) are left alone and never match.
    lowered = posts['subreddit'].map(
        lambda value: value.lower() if isinstance(value, str) else value
    )
    keep = keep & (lowered == subreddit_name)

    selected = (
        posts.assign(subreddit=lowered)
        .loc[keep, ['author', 'subreddit', 'subreddit_id']]
        .dropna(subset=['author'])  # rows without an author cannot be attributed
    )

    if selected.empty:
        # Nothing survived the filters: still emit a header-only file.
        file_output = pd.DataFrame(
            columns=['author', 'n_posts', 'subreddit', 'subreddit_id']
        )
    else:
        file_output = (
            selected.groupby('author')
            .agg(
                n_posts=('subreddit', 'count'),
                subreddit=('subreddit', first_mode),
                subreddit_id=('subreddit_id', first_mode),
            )
            .reset_index()
        )

    file_output.to_csv(prefix + '_posts.csv', index=False)

In [4]:
# Collect the names of all comment / submission CSV files in the working directory.
current_directory = os.getcwd()

# Require the '_' separator: the prefix below is everything before the last
# '_', so a name without one would produce an empty prefix.
files = [
    entry
    for entry in os.listdir(current_directory)
    if entry.endswith(('_comments.csv', '_submissions.csv'))
]

file_names = pd.DataFrame({'file_name': files})
# 'starting' is the file name without its last '_'-separated part.
starting = ['_'.join(name.split('_')[:-1]) for name in file_names['file_name']]
file_names['starting'] = starting
# Sorting on the prefix puts the two files of a pair on adjacent rows; the
# secondary key makes the order inside a pair deterministic (comments before
# submissions) instead of depending on os.listdir() order.
file_names = file_names.sort_values(['starting', 'file_name'], ignore_index=True)
file_names

Unnamed: 0,file_name,starting
0,QuayCounty_comments.csv,QuayCounty
1,QuayCounty_submissions.csv,QuayCounty
2,QueenAnnesCounty_comments.csv,QueenAnnesCounty
3,QueenAnnesCounty_submissions.csv,QueenAnnesCounty
4,Queensbury_comments.csv,Queensbury
...,...,...
227,rutlandvt_submissions.csv,rutlandvt
228,rva_comments.csv,rva
229,rva_submissions.csv,rva
230,ryeny_comments.csv,ryeny


In [5]:
# Process all csv files.
# file_names is sorted by prefix, so the two files of one area sit on adjacent
# rows. Walk the rows once: a row whose neighbour shares its prefix is handled
# as a pair, any other row is handled on its own.
position = 0
n_files = len(file_names)
while position < n_files:
    s_1 = file_names.loc[position, 'file_name']
    # The last row has no neighbour; it must still be processed on its own
    # rather than ending the loop early.
    s_2 = file_names.loc[position + 1, 'file_name'] if position + 1 < n_files else None

    # Check whether this area has both a comments and a submissions file.
    if s_2 is not None and '_'.join(s_1.split('_')[:-1]) == '_'.join(s_2.split('_')[:-1]):
        # With an identical prefix, '..._comments.csv' sorts before
        # '..._submissions.csv', so this passes the comments file first.
        s_1, s_2 = sorted((s_1, s_2))
        process_two(s_1, s_2)
        position += 2
    else:
        # Only one of the two files exists for this area.
        process_one(s_1)
        position += 1

  file_comment = pd.read_csv(s_1, encoding='ISO-8859-1')
  file_submission = pd.read_csv(s_2, encoding='ISO-8859-1')
  file_submission = pd.read_csv(s_2, encoding='ISO-8859-1')
  file_comment = pd.read_csv(s_1, encoding='ISO-8859-1')
  file_submission = pd.read_csv(s_2, encoding='ISO-8859-1')


In [6]:
# Remove submission and comments files.
# An input is deleted only when its '<prefix>_posts.csv' summary exists, so raw
# data that was never summarised is not lost. Inputs with no rows produce no
# summary and are therefore left in place.
for i in file_names['file_name']:
    summary_name = '_'.join(i.split('_')[:-1]) + '_posts.csv'
    # The existence check on the input keeps this cell re-runnable: a second
    # run finds nothing left to delete instead of raising FileNotFoundError.
    if os.path.exists(summary_name) and os.path.exists(i):
        os.remove(i)