# Transformations
This file will create a Data Frame that is optimized for visualizations in Tableau. This Data Frame will group by the factuality of different types of politically leaning Facebook pages.

In [1]:
import pandas as pd


buzzfeed_df = pd.read_csv('csv_collection/cleaned_buzzfeed_data.csv')

display(buzzfeed_df.head(3))

Unnamed: 0,account_id,post_id,category,page,post_url,date_published,post_type,rating,debate,share_count,reaction_count,comment_count,engagement_score
0,184096565021911,1035057923259100,mainstream,ABC News Politics,https://www.facebook.com/ABCNewsPolitics/posts...,2016-09-19,video,no factual content,,0.0,146.0,15.0,44.0
1,184096565021911,1035269309904628,mainstream,ABC News Politics,https://www.facebook.com/ABCNewsPolitics/posts...,2016-09-19,link,mostly true,,1.0,33.0,34.0,26.25
2,184096565021911,1035305953234297,mainstream,ABC News Politics,https://www.facebook.com/ABCNewsPolitics/posts...,2016-09-19,link,mostly true,,34.0,63.0,27.0,63.25


In [2]:
# Creating the dataframe:

tableau_df_prep = (
    buzzfeed_df.groupby(['category', 'rating']).agg(
        count=('rating', 'size'),
        total_engagement=('engagement_score', 'sum'),
        average_engagement=('engagement_score', 'mean'),
        share_count = ('share_count', 'sum'),
        reaction_count = ('reaction_count', 'sum'),
        comment_count = ('comment_count', 'sum')
    ).reset_index())

# Filtering out no factual content:
tableau_df_prep = tableau_df_prep[tableau_df_prep['rating'].isin(['mixture of true and false', 'mostly false', 'mostly true'])]

# Creating the rating order:
rating_order = ["mostly true", "mixture of true and false", "mostly false"]
tableau_df_prep['rating'] = pd.Categorical(tableau_df_prep['rating'], categories=rating_order, ordered=True)
tableau_df_prep['rating_num'] = tableau_df_prep['rating'].cat.codes

display(tableau_df_prep.head(19))

Unnamed: 0,category,rating,count,total_engagement,average_engagement,share_count,reaction_count,comment_count,rating_num
0,left,mixture of true and false,68,1327089.5,19516.022059,979944.0,1153774.0,117404.0,1
1,left,mostly false,22,279340.5,12697.295455,177355.0,360502.0,23720.0,2
2,left,mostly true,265,3376622.0,12741.969811,2313384.0,3746328.0,253312.0,0
4,mainstream,mixture of true and false,8,29589.75,3698.71875,15758.0,46953.0,4187.0,1
5,mainstream,mostly true,1085,451457.25,416.089631,157798.0,728327.0,223155.0,0
7,right,mixture of true and false,169,341221.0,2019.059172,229518.0,322282.0,62265.0,1
8,right,mostly false,82,268026.5,3268.615854,190386.0,252698.0,28932.0,2
9,right,mostly true,319,420293.0,1317.532915,265645.0,466888.0,75852.0,0


In [3]:
# Creating more obvious descriptors for columns:

tableau_df = tableau_df_prep.copy().rename(columns={
    'category': 'publication_type',
    'rating': 'factuality_type'
})

In [4]:
# Sorting the dataframe in order:
tableau_df = tableau_df.sort_values(by=['publication_type', 'rating_num'], ascending=True)

# Creating a percent column for number of posts, to show what type of posts happen at what rate:
def percent_calculator(df, count_col='count', new_col='percent_of_posts'):
    total = df[count_col].sum()
    df[new_col] = (df[count_col] / total) * 100
    return df


tableau_df_left = tableau_df[tableau_df['publication_type'] == 'left']
tableau_df_mainstream = tableau_df[tableau_df['publication_type'] == 'mainstream']
tableau_df_right = tableau_df[tableau_df['publication_type'] == 'right']

tableau_df_left = percent_calculator(tableau_df_left)
tableau_df_mainstream = percent_calculator(tableau_df_mainstream)
tableau_df_right = percent_calculator(tableau_df_right)


# Putting the dataframes back together:
tableau_df = pd.concat(
    [tableau_df_left, tableau_df_mainstream, tableau_df_right],
    axis=0,
    ignore_index=True
)

display(tableau_df.head(10))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[new_col] = (df[count_col] / total) * 100
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[new_col] = (df[count_col] / total) * 100
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[new_col] = (df[count_col] / total) * 100


Unnamed: 0,publication_type,factuality_type,count,total_engagement,average_engagement,share_count,reaction_count,comment_count,rating_num,percent_of_posts
0,left,mostly true,265,3376622.0,12741.969811,2313384.0,3746328.0,253312.0,0,74.647887
1,left,mixture of true and false,68,1327089.5,19516.022059,979944.0,1153774.0,117404.0,1,19.15493
2,left,mostly false,22,279340.5,12697.295455,177355.0,360502.0,23720.0,2,6.197183
3,mainstream,mostly true,1085,451457.25,416.089631,157798.0,728327.0,223155.0,0,99.26807
4,mainstream,mixture of true and false,8,29589.75,3698.71875,15758.0,46953.0,4187.0,1,0.73193
5,right,mostly true,319,420293.0,1317.532915,265645.0,466888.0,75852.0,0,55.964912
6,right,mixture of true and false,169,341221.0,2019.059172,229518.0,322282.0,62265.0,1,29.649123
7,right,mostly false,82,268026.5,3268.615854,190386.0,252698.0,28932.0,2,14.385965


In [5]:
# Creating the csv:
tableau_df.to_csv('csv_collection/fb_engagement_table.csv', index=False)

In [6]:
# Verifying the csv:
check_df = pd.read_csv('csv_collection/fb_engagement_table.csv')

display(check_df.head(9))

Unnamed: 0,publication_type,factuality_type,count,total_engagement,average_engagement,share_count,reaction_count,comment_count,rating_num,percent_of_posts
0,left,mostly true,265,3376622.0,12741.969811,2313384.0,3746328.0,253312.0,0,74.647887
1,left,mixture of true and false,68,1327089.5,19516.022059,979944.0,1153774.0,117404.0,1,19.15493
2,left,mostly false,22,279340.5,12697.295455,177355.0,360502.0,23720.0,2,6.197183
3,mainstream,mostly true,1085,451457.25,416.089631,157798.0,728327.0,223155.0,0,99.26807
4,mainstream,mixture of true and false,8,29589.75,3698.71875,15758.0,46953.0,4187.0,1,0.73193
5,right,mostly true,319,420293.0,1317.532915,265645.0,466888.0,75852.0,0,55.964912
6,right,mixture of true and false,169,341221.0,2019.059172,229518.0,322282.0,62265.0,1,29.649123
7,right,mostly false,82,268026.5,3268.615854,190386.0,252698.0,28932.0,2,14.385965
