### Dependencies

In [98]:
#Dependencies. Install the third-party packages that are missing, then import.
import importlib.util
import subprocess
import sys

# Module names double as pip distribution names for these three packages.
_INSTALLABLE_PACKAGES = ['fasttext', 'pycountry', 'tqdm']
_missing_packages = [name for name in _INSTALLABLE_PACKAGES
                     if importlib.util.find_spec(name) is None]
if _missing_packages:
    # Install into the interpreter that runs this kernel (a bare `pip` on PATH
    # may belong to a different environment).
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', *_missing_packages])
    # Make the freshly installed packages visible to the import system.
    importlib.invalidate_caches()

# The imports run unconditionally, i.e. also right after an installation, and
# any remaining ImportError is raised here instead of being swallowed.
import fasttext
import requests, os
import pandas as pd
import numpy as np
from tqdm import tqdm
from pycountry import languages #for formating
import re, string #for preprocessing

### Functions


In [44]:
#download pretrained model if not downloaded before.
def download_fastText_model(path, overwrite_model = False) -> None:
    '''Download the pretrained fastText language-identification model to `path`.

    Parameters
    ----------
    path : str or os.PathLike
        Destination file for the model (lid.176.bin).
    overwrite_model : bool, default False
        When False and a file already exists at `path`, nothing is downloaded
        and a message is printed instead.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.

    TODO: reference the two papers listed for the model at
    https://fasttext.cc/docs/en/language-identification.html
    '''
    # Check the requested destination, not a hard-coded location.
    if os.path.exists(path) and not overwrite_model:
        print('Model exists and is not overwritten')
        return

    url = 'https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin'
    # Download to a temporary sibling file first so that an interrupted download
    # never leaves a truncated model at `path` (which would then be treated as
    # "already downloaded" on the next call).
    partial_path = f'{os.fspath(path)}.part'
    with requests.get(url, stream = True) as response:
        response.raise_for_status()
        with open(partial_path, 'wb') as model_file:
            for chunk in response.iter_content(chunk_size = 1 << 20):
                model_file.write(chunk)
    os.replace(partial_path, path)


In [45]:
#Preprocesseing 
def preprocess_string(x:'input string',
                     remove_URLs = True,
                     remove_tagged_user = True,
                     lowercase = True,
                     remove_numbers = True,
                     remove_punctuation = True,
                     remove_extra_white_space = True,
                     remove_stopwords = True, stopwords:'list of stopwords' = None,
                     remove_listofwords = True, listofwords:'list of other words to be removed' = None
                     ) -> 'preprocessed string':

    '''Convert `x` to a string and apply the enabled preprocessing steps.

    The steps run in this fixed order; each one can be switched off with its flag:

    0. remove URLs
    1. remove tagged users (the "@" character and the word characters after it)
    -  remove the words in `listofwords` (case-sensitive: runs before lowercasing)
    2. lowercase
    3. remove digits
    4. remove ASCII punctuation (string.punctuation)
    5. collapse runs of whitespace into a single space
    6. remove the words in `stopwords` (compared against the lowercased text
       when `lowercase` is enabled)

    The two word-removal steps are skipped when their word list is None.
    Returns the cleaned string.
    '''

    cleaned = str(x)

    #0 Remove URLs
    if remove_URLs:
        # NOTE(review): the pattern only matches lowercase hosts (it runs before
        # lowercasing) and its trailing "(/.*)?" group also consumes everything
        # that follows a URL path up to the end of the line - confirm this is intended.
        cleaned = re.sub(r'(http:\/\/www\.|https:\/\/www\.|http:\/\/|https:\/\/)?[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(:[0-9]{1,5})?(\/.*)?','',cleaned)

    #1 Remove all tagged users, i.e. words starting with the '@' character.
    if remove_tagged_user:
        cleaned = re.sub(r'@\w*','',cleaned)

    # Remove caller-supplied words (e.g. the 'RT' marker) while the original casing is intact.
    if remove_listofwords and listofwords is not None:
        cleaned = ' '.join([word for word in cleaned.split() if word not in listofwords])

    #2 Lowercase all tweet text.
    if lowercase:
        cleaned = cleaned.lower()

    #3 Remove numbers.
    if remove_numbers:
        cleaned = re.sub(r'[0-9]','',cleaned)

    #4 Remove punctuation.
    if remove_punctuation:
        cleaned = cleaned.translate(str.maketrans('', '', string.punctuation))

    #5 Remove extra whitespaces (controlled by its own flag, not by remove_punctuation).
    if remove_extra_white_space:
        cleaned = re.sub(r"\s+"," ",cleaned)

    #6 Remove stopwords.
    if remove_stopwords and stopwords is not None:
        cleaned = ' '.join([word for word in cleaned.split() if word not in stopwords])

    return cleaned

In [46]:
def find_language(x, model:'pretrained fastText model' ) -> '(estimate, estimate_confidence)':
    '''Estimate the language of `x` with a pretrained fastText model.

    The model file can be fetched with the download_fastText_model function.

    Parameters
    ----------
    x : str
        Text to classify. Newlines are replaced by spaces before prediction
        because fastText's predict rejects text containing a newline character.
    model
        Object whose predict(text) result is indexed as pred[0][0] (top label)
        and pred[1][0] (score of that label).

    Returns
    -------
    tuple
        (language_name, confidence). language_name is the pycountry name for
        the predicted two-letter code, or None when the code cannot be resolved
        that way. confidence is the score the model returned for the top label.
    '''
    # fastText processes one line at a time and raises on embedded newlines.
    text = x.replace('\n', ' ') if isinstance(x, str) else x

    #predict from pretrained fastText-model:
    pred = model.predict(text)

    #convert results to language name
    language_iso = pred[0][0].replace('__label__', '')
    try:
        language = languages.get(alpha_2 = language_iso)
    except LookupError:
        # some pycountry versions raise KeyError for unknown codes instead of returning None
        language = None
    language_name = getattr(language, 'name', None)

    #score of the top prediction (a model confidence, not a measured accuracy)
    acc = pred[1][0]

    return (language_name, acc)

In [47]:
def isretweet(x:'string'):
    '''Return True when `x` starts with the characters "RT", otherwise False.'''
    # Test the match directly: an `assert` used as a branch is stripped under
    # `python -O`, which would make every input count as a retweet.
    return re.match('^RT', string = x) is not None

In [106]:
def twitter_sample_to_excel(df, sample_count,path, sample_names = None, sample_size_unique = 0, sample_size_collective = 0):
    '''Draw `sample_count` samples from `df` and write each one to an Excel file.

    Every output file holds `sample_size_unique` rows drawn for that file alone,
    followed by the same `sample_size_collective` rows shared by all files.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to sample rows from.
    sample_count : int
        Number of files to produce.
    path : str
        Existing folder; files are written to f'{path}/{name}.xlsx'.
    sample_names : list of str, optional
        One file name (without extension) per sample. Defaults to '0', '1', ...
    sample_size_unique : int, default 0
        Rows drawn separately for each file.
    sample_size_collective : int, default 0
        Rows drawn once and appended to every file.

    Raises
    ------
    ValueError
        If fewer than `sample_count` names are given.

    NOTE(review): the per-file samples are drawn independently of each other and
    of the shared sample, so the same row can appear in more than one of them.
    '''
    if sample_names is None:
        sample_names = [str(index) for index in range(sample_count)]
    elif len(sample_names) < sample_count:
        raise ValueError(f'expected {sample_count} sample names, got {len(sample_names)}')

    unique_samples = [df.sample(n = sample_size_unique) for _ in range(sample_count)]
    collective_sample = df.sample(n = sample_size_collective)

    for sample_name, unique_sample in zip(sample_names, unique_samples):
        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
        combined_sample = pd.concat([unique_sample, collective_sample])
        combined_sample.to_excel(f'{path}/{sample_name}.xlsx')


### Script

In [48]:
#Load the raw tweets from the Excel export into a DataFrame
tweets = pd.read_excel(io = 'data/resistance_tweets.xlsx')


In [49]:
#Clean tweets
tqdm.pandas() #registers progress_apply on pandas objects
tweets['tweet_cleaned'] = tweets['tweet'].progress_apply(lambda x: preprocess_string(x,listofwords = ['RT']))

#import model
PRETRAINED_MODEL_PATH = 'data/lid.176.bin'
#Pass the model path (previously the function object itself was passed as `path`)
download_fastText_model(PRETRAINED_MODEL_PATH, overwrite_model = False) #downloads model if not downloaded before
model = fasttext.load_model(PRETRAINED_MODEL_PATH) #model object passed to the find_language function


  from pandas import Panel
100%|██████████| 240423/240423 [00:21<00:00, 11054.58it/s]
Model exists and is not overwritten


#### Find language of tweets on a per tweet basis

In [50]:
#Estimate the language of every cleaned tweet; each result is a (name, confidence) tuple
tweets['language'] = tweets['tweet_cleaned'].progress_apply(lambda text: find_language(text, model = model))

#Unpack the tuples into one column per element, then drop the tuple column
new_col_list = ['language_estimate','language_estimate_acc']
tweets = tweets.assign(**{
    col: tweets['language'].apply(lambda pair, position = n: pair[position])
    for n, col in enumerate(new_col_list)
}).drop(columns = 'language')



100%|██████████| 240423/240423 [00:14<00:00, 16185.39it/s]


In [51]:
#Tweet count and mean prediction score per estimated language
aggregations = {'tweet':'count', 'language_estimate_acc':'mean'}
language_table = tweets.groupby('language_estimate').agg(aggregations)

#Show the ten most frequent languages
top_languages = language_table.sort_values(by = 'tweet', ascending = False).head(10)
top_languages

Unnamed: 0_level_0,tweet,language_estimate_acc
language_estimate,Unnamed: 1_level_1,Unnamed: 2_level_1
English,172786,0.824318
Dutch,38065,0.938708
German,21623,0.963159
French,2748,0.300501
Japanese,2160,0.967022
Finnish,1622,0.954531
Spanish,155,0.461261
Russian,144,0.488025
Portuguese,100,0.495158
Serbian,90,0.473764


#### Language detection on a per author basis

In [52]:
#create retweet column
tweets['retweet'] = tweets['tweet'].progress_apply(lambda x: isretweet(str(x)))
#Concatenate tweets by author (no retweets): retweets contribute an empty string
tweets['tweet_cleaned_no_rt'] = np.where(tweets['retweet'], '', tweets['tweet_cleaned'] )

tweets['AllTweetsByAuthor'] = tweets.groupby(['username'])['tweet_cleaned_no_rt'].transform(lambda x: ' '.join(x))
#one row per author; every row of an author carries the same concatenated text
tweets_all = tweets.drop_duplicates(['username']).copy()

#Find relevant language
tweets_all['language'] = tweets_all['AllTweetsByAuthor'].progress_apply(lambda x: find_language(str(x), model = model))

#format DataFrame
new_col_list = ['author_language_estimate','author_language_estimate_acc']
for n,col in enumerate(new_col_list):
    tweets_all[col] = tweets_all['language'].progress_apply(lambda x: x[n])
tweets_all = tweets_all.drop('language',axis=1)

#Drop author columns left over from an earlier run of this cell; otherwise the
#merge would create *_x / *_y duplicates and later cells could not find the columns.
tweets = tweets.drop(columns = new_col_list, errors = 'ignore')
tweets = pd.merge(left = tweets,
                  right = tweets_all.loc[:,['username'] + new_col_list],
                  on = 'username',
                  validate = 'many_to_one') #each author must match exactly one estimate row



100%|██████████| 240423/240423 [00:00<00:00, 249133.92it/s]
100%|██████████| 230/230 [00:02<00:00, 83.59it/s]
100%|██████████| 230/230 [00:00<00:00, 115118.13it/s]
100%|██████████| 230/230 [00:00<00:00, 234034.43it/s]


In [97]:
#drop concatenated tweet column (currently disabled)
#tweets = tweets.drop('AllTweetsByAuthor', axis = 1)

#Create english tweet dummy: the author is classified as English AND the tweet is
#either classified as English or classified as something else with a score below 0.5
is_english_tweet = (
    (tweets['author_language_estimate']=='English') &
    ((tweets['language_estimate'] == 'English')  |  (tweets['language_estimate_acc'] <0.5))
)
#Cast the boolean mask to 0/1 directly. pd.get_dummies(..., drop_first = True)
#returns a frame with no columns when every value is identical.
tweets['EnglishTweet'] = is_english_tweet.astype(int)

#save
tweets.to_csv('data/preprocessed_tweets_with_language.csv')


## Create sample of tweets to code manually

Preliminary open-coding sample: 20 unique tweets per person and 20 tweets that everyone gets.

In [93]:
#Only original tweets (no retweets) are eligible; filter on `tweets` itself
#(`df` is not defined at notebook level).
df_to_sample = tweets.loc[(tweets['retweet']==False),:]

twitter_sample_to_excel(df_to_sample, 4,'data', sample_size_unique = 20, sample_size_collective = 20)


Create a sample with only English tweets by authors classified as English. A tweet counts as English if its author is classified as English and fastText either classifies the tweet as English or classifies it as another language with a confidence score below 0.5.

In [94]:
#English, non-retweet tweets only; filter on `tweets` itself (`df` is not defined at notebook level).
df_to_sample = tweets.loc[(tweets['EnglishTweet'] == 1) & (tweets['retweet']==False),:]

#NOTE(review): no sample_names are given, so the files are named 0.xlsx ... 3.xlsx in
#'data' - the same names the previous sampling cell writes to.
twitter_sample_to_excel(df_to_sample, 4,'data', sample_size_unique = 0, sample_size_collective = 20)

#Preview the sampling frame instead of displaying the whole DataFrame
df_to_sample.head()

#### Produce 500 tweets for manual coding after testing above

In [107]:
#English, non-retweet tweets, keeping only the id and text columns;
#filter on `tweets` itself (`df` is not defined at notebook level).
df_to_sample = tweets.loc[(tweets['EnglishTweet'] == 1) & (tweets['retweet']==False),['tweet_id','tweet']]
#One file of 500 separately drawn tweets per coder, no shared tweets
twitter_sample_to_excel(df_to_sample, 4, 'data',
                        sample_names = ['E_500_tweets','AM_500_tweets','AJ_500_tweets', 'K_500_tweets'],
                        sample_size_unique = 500,
                        sample_size_collective = 0)

In [104]:
?twitter_sample_to_excel

[1;31mSignature:[0m
[0mtwitter_sample_to_excel[0m[1;33m([0m[1;33m
[0m    [0mdf[0m[1;33m,[0m[1;33m
[0m    [0msample_count[0m[1;33m,[0m[1;33m
[0m    [0mpath[0m[1;33m,[0m[1;33m
[0m    [0msample_names[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0msample_size_unique[0m[1;33m=[0m[1;36m0[0m[1;33m,[0m[1;33m
[0m    [0msample_size_collective[0m[1;33m=[0m[1;36m0[0m[1;33m,[0m[1;33m
[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m Create samples and export them to folder 
[1;31mFile:[0m      c:\users\espen\documents\sds\dm_and_asds2_exam_project\<ipython-input-91-b8695177e4ab>
[1;31mType:[0m      function
