# MajorityVote

0. Import Necessary Packages
1. Import Check list
2. Import and Process Subreddit Files
3. Label Duplicated Data and Separate from Unique Data
4. Apply Majority Vote for "out-of-State" Data.

## 0. Import Necessary Packages

In [1]:
import pandas as pd
import glob
import os
import random
import numpy as np

## 1. Import Check list

In [2]:
#Load the reference list that maps subreddits to places and states.
refer_df = pd.read_csv('ReferListTest.csv')

#Normalise the text columns to lower case so later name comparisons
#do not depend on capitalisation.
for text_col in ('Subreddit', 'Place_Name'):
    refer_df[text_col] = refer_df[text_col].str.lower()

#Turn the flag columns into numbers; anything unparsable becomes NaN.
for flag_col in ('City', 'District', 'State'):
    refer_df[flag_col] = pd.to_numeric(refer_df[flag_col], errors='coerce')

#Reference list restricted to rows flagged as states.
is_state = refer_df['State'].eq(1)
state_refer_df = refer_df[is_state]
print(state_refer_df)

#Reference list restricted to rows flagged as cities or districts.
is_city_or_district = refer_df['City'].eq(1) | refer_df['District'].eq(1)
refer_df = refer_df[is_city_or_district]
print(refer_df)

  Subreddit  City County  State  District   Place_Name State_Name
4        s1   0.0      0    1.0       0.0  test state1         S1
5        s2   0.0      0    1.0       0.0  test state2         S2
   Subreddit  City County  State  District  Place_Name State_Name
0  testcity1   1.0      0    0.0       0.0  test city1         S1
1  testcity0   1.0      0    0.0       0.0  test city0         S1
2  testcity2   1.0      0    0.0       0.0  test city2         S2
3  testcity3   1.0      0    0.0       0.0  test city3         S2
6  district0   0.0      0    0.0       1.0  test city1         S1


## 2. Import and Process Subreddit Files

In [3]:
#Build max_posts_df: for every author, the place(s) where they posted most.
#One per-author summary frame is collected per city/district file.
df_list = []

#Process city and district files one by one.
for filename in glob.glob('*_posts.csv'):
    df = pd.read_csv(filename)

    #Skip files that contain no rows.
    if df.empty:
        print(f"Empty in {filename}. Skipping...")
        continue

    #The subreddit name is the file name without its extension and '_posts'.
    subreddit_city = os.path.splitext(filename)[0].replace('_posts', '').lower()

    #Get this subreddit's info from the city/district refer list.
    matching_rows = refer_df[refer_df['Subreddit'] == subreddit_city]

    #Skip if this subreddit is not in the refer list
    #(state-level files end up here as well).
    if matching_rows.empty:
        print(f"No matching subreddit found for {subreddit_city}. Skipping...")
        continue

    #Skip subreddits whose refer-list row contains the literal string 'null'.
    row = matching_rows.iloc[0]
    if row.isin(['null']).any():
        print(f"Skipping {subreddit_city} due to null values.")
        continue

    #Unparsable post counts become NaN and are ignored by the sum below.
    df['n_posts'] = pd.to_numeric(df['n_posts'], errors='coerce')

    #Group by author so each author appears once per file.
    grouped_df = df.groupby('author')['n_posts'].sum().reset_index()

    #Add geographical info.
    grouped_df['Place_Name'] = row['Place_Name']
    grouped_df['State_Name'] = row['State_Name']

    #Keep for the concatenation below.
    df_list.append(grouped_df)

if df_list:
    #Concatenate the per-file summaries.
    max_posts_df = pd.concat(df_list)
    max_posts_df['n_posts'] = max_posts_df['n_posts'].astype(int)

    #Several subreddits can map to the same place, so sum per
    #(author, place, state).
    max_posts_df = max_posts_df.groupby(['author',
                                         'Place_Name',
                                         'State_Name'])['n_posts'].sum().reset_index()

    #Keep, for every author, the row(s) holding their maximum post count.
    #transform keeps the original row alignment, so ties are all retained.
    #The string alias 'max' avoids the deprecated builtin-callable path.
    author_max = max_posts_df.groupby('author')['n_posts'].transform('max')

    #Copy so later cells can add columns without writing into a slice.
    max_posts_df = max_posts_df[max_posts_df['n_posts'] == author_max].copy()
else:
    #pd.concat raises on an empty list; fall back to an empty frame with
    #the expected columns instead.
    print("No usable city/district post files found.")
    max_posts_df = pd.DataFrame(columns=['author', 'Place_Name',
                                         'State_Name', 'n_posts'])

max_posts_df

No matching subreddit found for s1. Skipping...
No matching subreddit found for s2. Skipping...
No matching subreddit found for teabag. Skipping...
Empty in TestCity3_posts.csv. Skipping...


Unnamed: 0,author,Place_Name,State_Name,n_posts
0,IhateS2,test city1,S1,99
1,IhateS2,test city2,S2,99
2,IliveInS1,test city1,S1,100
4,IliveInS1&neverlft,test city0,S1,99
5,IliveInS1&neverlft,test city1,S1,99
6,IliveInS2,test city1,S1,99
7,IliveInS2,test city2,S2,99
8,random1,test city0,S1,99
9,random2,test city0,S1,99
10,random3,test city0,S1,99


## 3. Label Duplicated Data and Separate from Unique Data

In [4]:
def determine_equal_amount(row):
    """Classify a tied author row as 'intra-state' or 'out-of-state'.

    Looks up every row of ``row['author']`` in the module-level
    ``max_posts_df``.

    Returns None when ``row['duplicated_author']`` is falsy, 'intra-state'
    when at least two of the author's rows share a State_Name, and
    'out-of-state' otherwise.
    """
    # NOTE(review): an author tied in two cities of one state plus a city of
    # another state is labelled 'intra-state' here - confirm this is intended.
    if not row['duplicated_author']:
        return None

    same_author = max_posts_df['author'] == row['author']
    author_states = max_posts_df.loc[same_author, 'State_Name']
    shares_a_state = author_states.duplicated().any()
    return 'intra-state' if shares_a_state else 'out-of-state'

#True for every row whose author still appears more than once.
tied_mask = max_posts_df.duplicated('author', keep=False)

#Authors left with a single row are already resolved. This copy is taken
#before the helper columns are added, so it does not carry them.
unique_authors_df = max_posts_df.loc[~tied_mask].copy()
print(f"{len(unique_authors_df.author)} unique authors after first process")

#Temporary flag column read by determine_equal_amount.
max_posts_df['duplicated_author'] = tied_mask

#Moves column describes the status of tied authors:
#out-of-state: tied rows all lie in different states
#intra-state: at least two tied rows lie in the same state
max_posts_df['Moves'] = max_posts_df.apply(determine_equal_amount, axis=1)
print(max_posts_df)

#The flag column has served its purpose.
del max_posts_df['duplicated_author']

#Tied authors, now carrying their Moves label.
duplicated_authors_df = max_posts_df.loc[tied_mask].copy()
print("Duplicated Authors Dataframe:")
print(duplicated_authors_df)

print("Unique Authors Dataframe:")
print(unique_authors_df)

4 unique authors after first process
                author  Place_Name State_Name  n_posts  duplicated_author  \
0              IhateS2  test city1         S1       99               True   
1              IhateS2  test city2         S2       99               True   
2            IliveInS1  test city1         S1      100              False   
4   IliveInS1&neverlft  test city0         S1       99               True   
5   IliveInS1&neverlft  test city1         S1       99               True   
6            IliveInS2  test city1         S1       99               True   
7            IliveInS2  test city2         S2       99               True   
8              random1  test city0         S1       99              False   
9              random2  test city0         S1       99              False   
10             random3  test city0         S1       99              False   

           Moves  
0   out-of-state  
1   out-of-state  
2           None  
4    intra-state  
5    intra-state  
6

## 4. Apply Majority Vote for "out-of-State" Data.

#Resolve 'out-of-state' ties with the state-level subreddit files:
#the author is assigned to the tied state whose state subreddit holds
#most of their posts.

#Rows kept after the vote, one single-row frame per resolved author.
rows_to_move = []

#Select rows labeled 'out-of-state'. Copy so nothing below operates on a
#slice of duplicated_authors_df.
out_state_rows = duplicated_authors_df[
    duplicated_authors_df['Moves'] == 'out-of-state'].copy()

#Map each state to the subreddit (and therefore file name) of its
#state-level posts. The first refer-list entry wins for a repeated state.
state_to_subreddit = (state_refer_df.drop_duplicates('State_Name')
                      .set_index('State_Name')['Subreddit'].to_dict())

#Per-state cache so every state file is read once, not once per author.
_state_posts_cache = {}


def _load_state_posts(state_name):
    """Return post totals per author (a Series) for a state's file.

    Returns None when the state has no subreddit in the refer list or its
    file cannot be read.
    """
    if state_name not in _state_posts_cache:
        posts = None
        subreddit = state_to_subreddit.get(state_name)
        if subreddit is None or pd.isna(subreddit):
            print(f"No state subreddit found for {state_name}. Skipping...")
        else:
            filename = f"{subreddit}_posts.csv"
            try:
                state_df = pd.read_csv(filename)
            except (FileNotFoundError, pd.errors.EmptyDataError):
                print(f"Cannot read {filename}. Skipping...")
            else:
                #Same clean-up as for the city files: numeric counts,
                #one total per author.
                state_df['n_posts'] = pd.to_numeric(state_df['n_posts'],
                                                    errors='coerce')
                posts = state_df.groupby('author')['n_posts'].sum()
        _state_posts_cache[state_name] = posts
    return _state_posts_cache[state_name]


#Process each unique author, in order of first appearance.
for author, author_state_df in out_state_rows.groupby('author', sort=False):

    #Posts of this author in the state subreddit of each tied state.
    state_totals = {}
    for state_name in author_state_df['State_Name'].unique():
        state_posts = _load_state_posts(state_name)
        if state_posts is not None and author in state_posts.index:
            state_totals[state_name] = state_posts[author]

    #Author never posted in any of the state subreddits: stays unresolved.
    if not state_totals:
        continue

    #State with most posts; on equal totals the first tied state wins.
    best_state = max(state_totals, key=state_totals.get)
    winner_rows = author_state_df[author_state_df['State_Name'] == best_state]

    #Only a single remaining row makes the author unique. Several rows
    #would mean a tie between places of the winning state.
    if len(winner_rows) == 1:
        rows_to_move.append(winner_rows)

if rows_to_move:
    #Concatenate all rows we want to keep.
    rows_to_move_df = pd.concat(rows_to_move)

    #Resolved authors are no longer duplicated.
    resolved_authors = rows_to_move_df['author'].unique()
    duplicated_authors_df = duplicated_authors_df[
        ~duplicated_authors_df['author'].isin(resolved_authors)].copy()

    #Drop Moves so the rows match the layout of the unique dataframe.
    rows_to_move_df = rows_to_move_df.drop(columns=['Moves'])
    unique_authors_df = pd.concat([unique_authors_df, rows_to_move_df])
else:
    #Nothing could be resolved; keep an empty frame so the name exists.
    rows_to_move_df = out_state_rows.iloc[0:0].drop(columns=['Moves'])

#Row counts per label among the authors that are still duplicated.
intra_state = len(duplicated_authors_df[duplicated_authors_df["Moves"]=="intra-state"])
out_state = len(duplicated_authors_df[duplicated_authors_df["Moves"]=="out-of-state"])

#Author counts: count distinct authors instead of assuming two rows each.
intra_state_authors = duplicated_authors_df.loc[
    duplicated_authors_df["Moves"]=="intra-state", 'author'].nunique()
out_state_authors = duplicated_authors_df.loc[
    duplicated_authors_df["Moves"]=="out-of-state", 'author'].nunique()

print(f"{len(unique_authors_df.author)} unique authors after second process")
print(f"{intra_state_authors} intra-state authors after second process")
print(f"{out_state_authors} out-of-state authors after second process")

print("Unique Authors Dataframe:")
print(unique_authors_df)
print("Duplicated Authors Dataframe:")
print(duplicated_authors_df)

In [6]:
#Write both result tables to CSV without the index column.
for frame, target in ((unique_authors_df, 'Uniq_MaxPosts.csv'),
                      (duplicated_authors_df, 'Duplic_MaxPosts.csv')):
    frame.to_csv(target, index=False)