In [None]:
import pandas as pd
import numpy as np
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Read the polarity table from disk and discard every row with a missing value.
polarities = (
    pd.read_csv("./polarities.csv")
    .dropna()
)
polarities.head()

In [None]:
# Distinct values found in the 'user/name' column.
polarities.loc[:, 'user/name'].unique()

In [None]:
# Number of rows for each value of the 'flagged' column.
polarities.loc[:, 'flagged'].value_counts()

### Exploratory Analysis

In [None]:
# Keep the flag plus the three polarity dimensions and summarise them.
data = polarities.loc[:, ['flagged', 'pol_rightness', 'credibility', 'moderacy']]
data.describe()

In [None]:
# Overlay the density of 'pol_rightness' for flagged rows and unflagged rows.
# NOTE(review): `shade=` is deprecated in newer seaborn releases in favour of
# `fill=` — confirm against the installed seaborn version before changing it.
pol_flagged = data.loc[data['flagged'], 'pol_rightness']
pol_unflagged = data.loc[data['flagged'] == False, 'pol_rightness']
sns.kdeplot(pol_flagged, shade=True)
sns.kdeplot(pol_unflagged, shade=True)
# Both curves share the current axes, so the title can be set after drawing.
plt.title("Political Polarization")
plt.legend(title='Tweet Liked', loc='upper left', labels=['Flagged', 'Unflagged'])

In [None]:
# Overlay the density of 'credibility' for flagged rows and unflagged rows.
# NOTE(review): `shade=` is deprecated in newer seaborn releases in favour of
# `fill=` — confirm against the installed seaborn version before changing it.
pol_flagged = data.loc[data['flagged'], 'credibility']
pol_unflagged = data.loc[data['flagged'] == False, 'credibility']
sns.kdeplot(pol_flagged, shade=True)
sns.kdeplot(pol_unflagged, shade=True)
# Both curves share the current axes, so the title can be set after drawing.
plt.title("Credibility Polarization")
plt.legend(title='Tweet Liked', loc='upper left', labels=['Flagged', 'Unflagged'])

In [None]:
# Overlay the density of 'moderacy' for flagged rows and unflagged rows.
# NOTE(review): `shade=` is deprecated in newer seaborn releases in favour of
# `fill=` — confirm against the installed seaborn version before changing it.
pol_flagged = data.loc[data['flagged'], 'moderacy']
pol_unflagged = data.loc[data['flagged'] == False, 'moderacy']
sns.kdeplot(pol_flagged, shade=True)
sns.kdeplot(pol_unflagged, shade=True)
# Both curves share the current axes, so the title can be set after drawing.
plt.title("Moderacy Polarization")
plt.legend(title='Tweet Liked', loc='upper left', labels=['Flagged', 'Unflagged'])

### Examining Correlation  
This section looks at whether there is a quantifiable and/or visual correlation between each pair of the three dimensions. Each of the plots below is made up of points, each one representing a user. The color of a point indicates whether that user retweeted a flagged tweet or an unflagged tweet.

**Political vs Credibility**

In [None]:
# Scatter of political score against credibility score, coloured by the
# 'flagged' column. `data` is kept as a two-column frame because the
# correlation cell that follows reads it.
data = polarities[['pol_rightness', 'credibility']]
sns.scatterplot(
    x=data['pol_rightness'],
    y=data['credibility'],
    hue=polarities['flagged'],
).set_title("User Political versus Credibility Polarity")  # title typo "verus" fixed

In [None]:
# Correlation between the two dimensions plotted above. The columns are
# selected explicitly instead of reading the global `data`, which is rebound by
# several other cells; this keeps the printed matrix correct even when cells
# are re-run out of order.
print("Correlation Matrix")
print(polarities[['pol_rightness', 'credibility']].corr())

**Political vs Moderacy**

In [None]:
# Scatter of political score against moderacy score, coloured by the
# 'flagged' column.
data = polarities.loc[:, ['pol_rightness', 'moderacy']]
sns.scatterplot(
    x=data.loc[:, 'pol_rightness'],
    y=data.loc[:, 'moderacy'],
    hue=polarities.loc[:, 'flagged'],
)
# The scatter is drawn on the current axes, so pyplot's title targets it.
plt.title("User Political vs Moderacy Polarity")

In [None]:
# Correlation between the two dimensions plotted above. The columns are
# selected explicitly instead of reading the global `data`, which is rebound by
# several other cells; this keeps the printed matrix correct even when cells
# are re-run out of order.
print("Correlation Matrix")
print(polarities[['pol_rightness', 'moderacy']].corr())

**Moderacy vs Credibility**

In [None]:
# Scatter of credibility score against moderacy score, coloured by the
# 'flagged' column. `data` is kept as a two-column frame because the
# correlation cell that follows reads it.
data = polarities[['moderacy', 'credibility']]
sns.scatterplot(
    x=data['credibility'],
    y=data['moderacy'],
    hue=polarities['flagged'],
).set_title("User Credibility versus Moderacy Polarity")  # title typo "verus" fixed

In [None]:
# Correlation between the two dimensions plotted above. The columns are
# selected explicitly instead of reading the global `data`, which is rebound by
# several other cells; this keeps the printed matrix correct even when cells
# are re-run out of order.
print("Correlation Matrix")
print(polarities[['moderacy', 'credibility']].corr())

### Test 1
I will perform a permutation test for each dimension. That is, for one dimension I will calculate the mean polarity score of the flagged-tweets group and the mean polarity score of the unflagged-tweets group, and define the test statistic as the difference between those two means. I will then shuffle the flagged/unflagged labels and recalculate the statistic. I will do this 1000 times and then calculate the p-value.  
  
**Permutation Test:**  
  
Test Statistic: Define the test statistic as the difference between the sample means for the flagged and unflagged group for one dimension  
  
Null Hypothesis: There is no difference in the polarity scores between the flagged and unflagged groups  
  
Alternative Hypothesis: There is some difference in the polarity scores

In [None]:
# Defining functions for use in permutation test
def mean_diff(data, dim):
    """Difference in the mean of `dim` between flagged and unflagged rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'flagged' column and the column named by `dim`.
    dim : str
        Name of the score column to compare.

    Returns
    -------
    float
        Mean of `dim` over rows where 'flagged' == True minus the mean over
        rows where 'flagged' == False. NaN when either group is empty.
    """
    flagged_scores = data.loc[data['flagged'] == True, dim]
    unflagged_scores = data.loc[data['flagged'] == False, dim]
    return np.mean(flagged_scores) - np.mean(unflagged_scores)


def permutation_test(data, n_reps, dim, alternative='two-sided', random_state=None):
    """Permutation test on the flagged/unflagged difference in means of `dim`.

    The 'flagged' labels are shuffled `n_reps` times; each time the statistic
    from `mean_diff` is recomputed, and the p-value is the fraction of shuffled
    statistics at least as extreme as the observed one.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'flagged' column and the column named by `dim`.
        Its index may be arbitrary (e.g. have gaps left by `dropna()`).
    n_reps : int
        Number of label shuffles; must be at least 1.
    dim : str
        Name of the score column to test.
    alternative : {'two-sided', 'greater', 'less'}, default 'two-sided'
        'two-sided' counts shuffles with |statistic| >= |observed|, matching an
        alternative hypothesis of "some difference". 'greater' counts
        statistic >= observed; 'less' counts statistic <= observed.
    random_state : int or numpy.random.RandomState, optional
        Seed or generator for the shuffles. When None, NumPy's global random
        state is used (so `np.random.seed` controls reproducibility).

    Returns
    -------
    float
        The permutation p-value, in [0, 1].

    Raises
    ------
    ValueError
        If `n_reps` < 1 or `alternative` is not a recognised option.
    """
    if n_reps < 1:
        raise ValueError("n_reps must be at least 1")
    if alternative not in ('two-sided', 'greater', 'less'):
        raise ValueError("alternative must be 'two-sided', 'greater' or 'less'")

    # Observed statistic
    obs = mean_diff(data, dim)

    # Build the generator once so that successive shuffles differ; passing the
    # same integer seed to every .sample() call would repeat one permutation.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    # Running and permuting n_reps of the data
    trials = np.empty(n_reps)
    for i in range(n_reps):
        # Convert to a plain array before re-attaching: assigning a Series
        # would align on index labels, which silently mislabels or drops rows
        # whenever `data` does not have a clean 0..n-1 index.
        shuffled_flags = (
            data['flagged']
            .sample(replace=False, frac=1, random_state=rng)
            .to_numpy()
        )
        shuffled = data.assign(flagged=shuffled_flags)
        trials[i] = mean_diff(shuffled, dim)

    if alternative == 'two-sided':
        extreme = np.abs(trials) >= np.abs(obs)
    elif alternative == 'greater':
        extreme = trials >= obs
    else:
        extreme = trials <= obs
    return np.count_nonzero(extreme) / n_reps

In [None]:
# Run the permutation test once for each of the three polarity dimensions.
N_PERMUTATIONS = 1000

# Fix NumPy's global seed so the shuffles (drawn via pandas `.sample`, which
# falls back to the global state) give the same p-values on every run.
np.random.seed(0)

# Rows were removed by dropna() at load time, so the index may have gaps.
# Reset it to 0..n-1 so that re-attaching shuffled labels lines up with every row.
data = (
    polarities[['pol_rightness', 'credibility', 'moderacy', 'flagged']]
    .reset_index(drop=True)
)
outcomes = []
dimensions = ['pol_rightness', 'credibility', 'moderacy']
for dim in dimensions:
    out = (dim, permutation_test(data, N_PERMUTATIONS, dim))
    outcomes.append(out)
    print('Dimension: ' + out[0])
    print('p-value: ' + str(out[1]) + '\n')

**Evaluating:**  
The breakdown of the tests for each dimension is reported above. Notice that for the credibility and the moderacy dimensions, there is a more significant difference between the two groups than there is for the political dimension. Hopefully with more data this becomes more apparent. 

### Test 2
We will compare the data for each dimension using a t-test. Under this test we assume the data is normally distributed with the same variance. We will perform a two-sided test between the flagged and unflagged group for each dimension.  
  
**Two-sided t-test**  

In [None]:
# Two-sided, equal-variance t-test of flagged vs unflagged scores, run once
# per dimension. Each (dimension, result) pair is also collected in `outcomes`.
outcomes = []
for dim in dimensions:
    flagged = data[data['flagged'] == True][dim]
    unflagged = data[data['flagged'] == False][dim]
    out = (dim, stats.ttest_ind(flagged, unflagged, equal_var=True))
    outcomes.append(out)
    print('Dimension: ' + out[0])
    # Print only the p-value; str(out[1]) would dump the whole result object
    # (statistic and p-value) under the "p-value" label.
    print('p-value: ' + str(out[1].pvalue) + '\n')

**Outcome** 
We can see a very similar outcome to the permutation test, as we would expect. Next we will run a one-sided t-test comparing the two groups.

### Test 3
We will test whether the flagged group has statistically significantly higher or lower polarity scores than the unflagged group for each of the three dimensions. We will use a one-sided t-test.  
**One-sided t-test**

In [None]:
# One-sided reading of the equal-variance t-test, per dimension: the two-sided
# p-value is halved and reported together with the test statistic, whose sign
# gives the direction of the difference.
outcomes = []
for dim in dimensions:
    flagged = data.loc[data['flagged'] == True, dim]
    unflagged = data.loc[data['flagged'] == False, dim]
    result = stats.ttest_ind(flagged, unflagged, equal_var=True)
    out = (dim, result)
    outcomes.append(out)
    print('Dimension: ' + dim)
    print('p-value: ' + str(result[1] / 2))
    print('Test Statistic: ' + str(result[0]) + '\n')

**Outcome**  
First, let alpha = 0.1. Next, we interpret the one-sided test as follows: the flagged polarities are significantly greater than the unflagged polarities if the p-value < alpha and the test statistic is > 0. Likewise, if the p-value < alpha and the test statistic is < 0, the flagged polarities are significantly lower than the unflagged polarities. Notice here that while there is no statistically significant outcome for the moderacy and credibility dimensions, the political polarity of users who retweeted a flagged tweet is statistically significantly greater than that of the users who retweeted an unflagged tweet. This is what we would expect: the flagged tweets come from a user who is generally aligned with right-wing views, so the users interacting with a flagged tweet tend to have a polarity that leans further towards the right.  