In [None]:
import re

import pandas as pd
import plotly.express as px
from scipy.stats import pearsonr
# Step 1: load all tweet files and combine them into a single DataFrame
tweet_files = [
    "/content/Tweets_part1.csv",
    "/content/Tweets_part2.csv",
    "/content/Tweets_part3.csv",
    "/content/Tweets_part4.csv",
    "/content/Tweets_part5.csv",
]

tweets_parts = []
for f in tweet_files:
    try:
        # on_bad_lines='skip' drops rows the parser cannot read instead of
        # aborting the whole file.
        df = pd.read_csv(f, engine='python', quotechar='"', doublequote=True,
                         on_bad_lines='skip')
    except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError) as err:
        # Best effort: one missing or unreadable part must not discard the rest.
        print(f"Skipping {f}: {err}")
        continue
    tweets_parts.append(df)

if not tweets_parts:
    # pd.concat([]) fails with an unhelpful message; report the real cause.
    raise RuntimeError(
        "None of the tweet files could be loaded: " + ", ".join(tweet_files)
    )

# Concatenate once, after every part has been read.
tweets = pd.concat(tweets_parts, ignore_index=True)
# Step 2: load the vaccination data and reduce it to one row per location
vax = pd.read_csv('/content/vaccinations.csv')

# Sort by location, then date, so the last row of each group is the
# latest-dated one (assumes the date column sorts chronologically - TODO confirm).
vax_sorted = vax.sort_values(by=['location', 'date'])

# GroupBy.last() skips missing values by default, so each column holds the
# most recent non-null entry for its location.
latest_by_location = vax_sorted.groupby('location', as_index=False).last()

# Keep only the two columns used downstream and give them shorter names.
column_names = {
    'location': 'country',
    'people_vaccinated_per_hundred': 'vax_percent',
}
vax_latest = latest_by_location[list(column_names)].rename(columns=column_names)

# Distinct country names, used to match tweet locations in step 3.
all_countries = vax_latest['country'].unique()

# Step 3: match each tweet's user location to a country
# Cache: tuple of country names -> (compiled regex or None, {lowercased: original})
_COUNTRY_MATCHERS = {}


def _country_matcher(countries):
    """Return a cached whole-word regex and name lookup for *countries*."""
    key = tuple(countries)
    cached = _COUNTRY_MATCHERS.get(key)
    if cached is None:
        by_lower = {
            name.lower(): name
            for name in key
            if isinstance(name, str) and name
        }
        # Longest names first so "papua new guinea" wins over "guinea" when
        # both could match at the same position.
        ordered = sorted(by_lower, key=len, reverse=True)
        pattern = None
        if ordered:
            alternatives = "|".join(re.escape(name) for name in ordered)
            # The lookarounds require the name to stand alone as a word, so
            # "niger" no longer matches inside "nigeria" or "oman" inside
            # "romania".
            pattern = re.compile(r"(?<!\w)(?:" + alternatives + r")(?!\w)")
        cached = (pattern, by_lower)
        _COUNTRY_MATCHERS[key] = cached
    return cached


def match_country(loc, countries=None):
    """Return the country named in a free-text location, or None.

    Matching is case-insensitive and whole-word: a country name only counts
    when it is not embedded in a longer word. If several names occur, the
    leftmost one wins; if several start at the same position, the longest wins.

    Args:
        loc: Location text (for example a tweet's user_location). Missing
            values (None/NaN) yield None; other values are converted with str().
        countries: Iterable of country names to look for. Defaults to the
            module-level ``all_countries``.

    Returns:
        The matching country name exactly as it appears in *countries*, or
        None when nothing matches.
    """
    if pd.isna(loc):
        return None
    if countries is None:
        countries = all_countries
    pattern, by_lower = _country_matcher(countries)
    if pattern is None:
        return None
    hit = pattern.search(str(loc).lower())
    return by_lower[hit.group(0)] if hit else None
# Tag every tweet with the country found in its user_location (None if no match).
tweets['country'] = tweets['user_location'].apply(match_country)
# Keep only matched tweets. .copy() makes the filtered frame independent, so
# adding columns to it later does not trigger pandas' SettingWithCopyWarning.
tweets = tweets[tweets['country'].notna()].copy()
# Step 4: flag misinformation tweets
# Lower-case keywords and phrases; a tweet is flagged when its lower-cased
# text contains any of them as a substring.
misinfo_words = [
    'microchip',
    'hoax',
    'plandemic',
    '5g',
    'fake vaccine',
    'dna change',
    'magnet',
    'tracking device',
]

def is_misinfo(txt, keywords=None):
    """Return True when the text contains any misinformation keyword.

    The check is a case-insensitive substring test: the text is lower-cased
    and each keyword is looked up inside it as-is, so keywords are expected
    to be lower-case already.

    Args:
        txt: Tweet text. Missing values (None/NaN) return False; non-string
            values are converted with str() rather than raising.
        keywords: Iterable of lower-case keywords/phrases. Defaults to the
            module-level ``misinfo_words``.

    Returns:
        True if at least one keyword occurs in the text, otherwise False.
    """
    if pd.isna(txt):
        return False
    if keywords is None:
        keywords = misinfo_words
    lowered = str(txt).lower()
    return any(word in lowered for word in keywords)

# Boolean flag per tweet: does its text contain a misinformation keyword?
tweets['is_misinformation'] = tweets['text'].apply(is_misinfo)

# Step 5: calculate the misinformation percentage per country
summary = (
    tweets.groupby('country')
          .agg(
              # Summing booleans counts the flagged tweets.
              num_misinfo=('is_misinformation', 'sum'),
              # 'count' counts non-null text values only.
              total_tweets=('text', 'count'),
          )
          .reset_index()
)
summary['misinfo_percent'] = (summary['num_misinfo'] / summary['total_tweets']) * 100
# Step 6: merge with vaccination data
# Left join keeps every country from the vaccination table, including those
# with no matched tweets.
final = pd.merge(
    vax_latest,
    summary[['country', 'misinfo_percent']],
    on='country',
    how='left',
)
# Countries without any matched tweets get 0 % instead of NaN.
# NOTE(review): this treats "no tweets" the same as "no misinformation" -
# confirm that is intended for the correlation in step 9.
final['misinfo_percent'] = final['misinfo_percent'].fillna(0)
# Step 7: choropleth map of the misinformation share per country
# Sequential green palette, lightest to darkest.
green_scale = [
    '#f7fcf5', '#e5f5e0', '#c7e9c0', '#a1d99b',
    '#74c476', '#41ab5d', '#238b45', '#006d2c', '#00441b',
]
# Hover values are shown with two decimal places.
two_decimals = ':.2f'

fig_map = px.choropleth(
    final,
    locations='country',
    locationmode='country names',
    color='misinfo_percent',
    hover_name='country',
    hover_data={'misinfo_percent': two_decimals, 'vax_percent': two_decimals},
    color_continuous_scale=green_scale,
    # Fix the colour axis to the full 0-100 % range.
    range_color=(0, 100),
    title='Percentage of Misinformation Tweets by Country',
)

map_geo = {
    'showframe': False,
    'showcoastlines': True,
    'projection_type': 'equirectangular',
    'bgcolor': 'white',
}
colorbar_style = {'title': "Misinformation (%)", 'tickformat': ".0f"}
fig_map.update_layout(
    geo=map_geo,
    coloraxis_colorbar=colorbar_style,
    font={'size': 14},
)
fig_map.show()

# Step 8: scatter plot of misinformation share vs. vaccination rate
fig_scatter = px.scatter(
    final,
    x='misinfo_percent',
    y='vax_percent',
    text='country',
    # NOTE: Plotly Express needs the statsmodels package for OLS trendlines.
    trendline='ols',
    title="Country-level % Misinformation Tweets vs. % Vaccinated",
    labels={'misinfo_percent': '% Misinformation Tweets',
            'vax_percent': '% Vaccinated'},
    color='misinfo_percent',
    color_continuous_scale='Greens',
)
# Put the country labels above their markers and outline each marker.
fig_scatter.update_traces(
    textposition='top center',
    marker=dict(size=12, line=dict(width=1, color='DarkSlateGrey')),
)
fig_scatter.update_layout(
    template='plotly_white', width=900, height=550,
    font=dict(size=15), margin=dict(l=60, r=20, t=70, b=60),
    coloraxis_colorbar=dict(title='Misinformation(%)'),
)

fig_scatter.show()
# Step 9: Pearson correlation between misinformation share and vaccination rate
# Drop rows where either value is missing so they cannot turn the result
# into NaN (vax_percent may be absent for some locations).
paired = final[['misinfo_percent', 'vax_percent']].dropna()
corr, pval = pearsonr(paired['misinfo_percent'], paired['vax_percent'])
# Report the result; the original computed it without ever showing it.
print(f"Pearson r = {corr:.3f} (p = {pval:.3g}, n = {len(paired)})")

# Step 10: save the results to a .csv file
# index=False keeps the DataFrame's row index out of the output file.
final[['country', 'misinfo_percent', 'vax_percent']].to_csv(
    '/content/country_misinfo_vax_percent_clean.csv', index=False
)













