In [None]:
import pandas as pd
import numpy as np
import itertools
import os
import snscrape.modules.twitter as sntwt

In [None]:
# define the function that filters collected users against the configured criteria
def _bio_mentions(bio: str, phrases) -> bool:
    """
    tell whether the bio contains at least one of the given phrases as whole words.

    the comparison is case-insensitive, tolerates punctuation around a word ("CEO," matches "ceo")
    and supports multi-word phrases ("vice president"), which a plain `phrase in bio.split()` test
    can never match because split() only yields single tokens.

    :param: bio -- lower-cased bio text.
    :param: phrases -- iterable of keywords / key phrases.

    :return: True if any phrase occurs in the bio, False otherwise.
    """
    import re  # local import so this cell does not depend on the import cell being edited

    for phrase in phrases:
        words = phrase.lower().split()
        if not words:
            continue
        # words of a phrase may be separated by any whitespace; the look-arounds keep
        # "sex" from matching inside "sexy" while still accepting "ceo," or "#ceo"
        pattern = r"(?<!\w)" + r"\s+".join(re.escape(w) for w in words) + r"(?!\w)"
        if re.search(pattern, bio):
            return True
    return False


def users_handler(user_info: "sntwt.Tweet", filters: dict):
    """
    this function is used to filter the given Tweet instance by performing the following processes:
        1- extracting the bio/description of the tweet's author (a missing bio is treated as empty).
        2- check that the bio contains at least one of filters['keywords'].
        3- check that the bio contains none of filters['unwanted keywords'].
        4- check that the followers count is strictly greater than filters['followers_count'].

    :param: user_info -- an instance of snscrape.modules.twitter.Tweet class contains the information about collected tweet.
    :param: filters -- a dictionary with the keys 'keywords', 'unwanted keywords' and 'followers_count'.

    :return: the same user_info object when every check passes, otherwise the integer -1.

    """
    # extracting user info from the collected tweet; both fields may be missing
    user_bio = (user_info.user.rawDescription or "").lower()
    user_follower_count = user_info.user.followersCount or 0

    if not _bio_mentions(user_bio, filters['keywords']):
        return -1
    if _bio_mentions(user_bio, filters['unwanted keywords']):
        return -1
    if user_follower_count <= filters['followers_count']:
        return -1
    return user_info



In [None]:
def users_adder(main_user_dict, per_itr_user_dict: dict, user_info: "sntwt.Tweet"):
    """
    this function is used to add the author of the passed tweet to the per-iteration dictionary,
    unless that username was already collected (either in the main dictionary or earlier in this iteration).

    :param: main_user_dict -- this is the main dictionary that contains the information about the passed users
                                {'Username':[...], 'Bio':[...], 'profile URL':[...], 'Location':[...],
                                'Websites':[...], '#followers':[...]}; it is only read here.
    :param: per_itr_user_dict -- same structure as main_user_dict but it gets updated every iteration on the key word combination
    :param: user_info -- instance of snscrape.modules.twitter.Tweet class contains the information about the passed tweet.

    :return: tuple of (per_itr_user_dict, number of users collected so far in per_itr_user_dict).
    """
    user = user_info.user
    user_name = user.username

    # also look at the current batch: one author can show up many times in a single search,
    # and counting them repeatedly would fill the batch with copies of the same user
    already_collected = (user_name in main_user_dict['Username']
                         or user_name in per_itr_user_dict['Username'])

    if not already_collected:
        # keep honouring a `user_website` attribute if one exists, then fall back to the
        # profile link stored on the user object (`link.url`, or `linkUrl` on older snscrape)
        user_website = getattr(user_info, 'user_website', None)
        if user_website is None:
            profile_link = getattr(user, 'link', None)
            user_website = getattr(profile_link, 'url', None) or getattr(user, 'linkUrl', None)

        per_itr_user_dict['Username'].append(user_name)
        # bio and location are optional on a profile, so treat a missing value as empty text
        per_itr_user_dict['Bio'].append((user.rawDescription or '').lower())
        # everything before 'status' in the tweet URL is the author's profile URL
        per_itr_user_dict['profile URL'].append(user_info.url.split('status')[0])
        per_itr_user_dict['Location'].append((user.location or '').lower())
        per_itr_user_dict['Websites'].append(user_website)
        per_itr_user_dict['#followers'].append(user.followersCount)

    return per_itr_user_dict, len(per_itr_user_dict['Username'])
    

# Using Snscrape

### Algorithm description:  

The general idea for finding the users of interest is to search Twitter for tweets that contain a set of keywords,
because the users we are interested in are likely to have written such tweets.
We take the following steps:
1. create a dictionary that contains:
    - words that we want our user's bio to include
    - min number of followers of each user
    - words that we do not want to include in our search
2. generate every combination of 2 words from the previously created keywords
3. initialize a main dictionary whose keys represent the information to be collected about the users
4. looping on the created combination and for each combination:
    - create the search query
    - initialize a dictionary with the same structure as the main dictionary. its purpose is to store user information
    per iteration on the combination.
    - for each collected tweet:
        - check if the author of this tweet passes the specified criteria by utilizing users_handler() 
        - if the user passes, add the collected info to `per_iter_user_dict`
        - if the number of users collected for this combination is greater than 10, append the collected info
          to the main dictionary and break from the loop
5. create a data frame from the generated dictionary and save the file as CSV  

In [None]:
# setting the filters up
# NOTE(review): 'co funder' looks like a typo for 'co founder' -- confirm before changing it,
# because this list drives both the search queries and the bio filter.
filters = {'keywords':['CEO', 'vice president', 'president',
                      'chief', 'founder', 'co funder', 'CTO', 'Congress Women', 'Congress men',
                      'senator', 'MP', 'parliament', 'head', 'senior', 'Activist', 'creator', 'board member',
                      'Chairman', 'VP', 'Boss'],
           'unwanted keywords': ['sex', 'porn', 'adult', 'PLAYMATE', 'Model'],
           'followers_count':10000,}

# optional cap on how many tweets are scanned per query; None keeps the original behaviour of
# scanning until enough users are found or the search results run out
MAX_TWEETS_PER_QUERY = None


def _as_search_term(keyword):
    """wrap a multi-word keyword in double quotes so the search treats it as one exact phrase."""
    return '"{}"'.format(keyword) if ' ' in keyword else keyword


# generating combination of the desired words 2 at a time
desired_words_combinations = list(itertools.combinations(filters['keywords'], 2))

# setting the unwanted words in tweets (the query template below adds the first '-')
undsired_words = ' -'.join(filters['unwanted keywords'])

main_user_dict = {'Username':[], 'Bio':[], 'profile URL':[], 'Location':[], 'Websites':[],'#followers':[]}

for word in desired_words_combinations:
    # setting the query; without the quotes 'vice president' would be searched as two separate words
    desired_words = ' OR '.join(_as_search_term(keyword) for keyword in word)
    query = '({}) -{} lang:en until:2023-01-07 since:2020-01-01'.format(desired_words, undsired_words)
    print("search query is: {} \n".format(query))

    per_iter_user_dict = {'Username':[], 'Bio':[], 'profile URL':[], 'Location':[], 'Websites':[],'#followers':[]}

    collected_users = 0
    found_tweets = sntwt.TwitterSearchScraper(query).get_items()
    for i, tweet in enumerate(itertools.islice(found_tweets, MAX_TWEETS_PER_QUERY)):
        # progress is reported for every 100th scanned tweet, whether or not its author passes the filters
        if i % 100 == 0:
            print(i)

        if users_handler(tweet, filters) == -1:
            continue

        previously_collected = collected_users
        collected_users = users_adder(main_user_dict, per_iter_user_dict, tweet)[1]
        if collected_users > 10:
            print('breaking the loop')
            break
        # only announce a count once, when it has actually grown
        if collected_users != previously_collected and collected_users % 2 == 0:
            print("\n collected {} users".format(collected_users))

    # merge the batch whether the loop ended through `break` or because the results ran out;
    # otherwise a combination that yields 10 users or fewer would be silently discarded
    for column, values in per_iter_user_dict.items():
        main_user_dict[column].extend(values)


In [None]:
# full table of collected users, including the follower counts
df = pd.DataFrame(main_user_dict)
# copy without the follower counts; `columns=` names the axis explicitly instead of
# passing `axis=True`, which only worked because True compares equal to 1
final_df = df.drop(columns=['#followers'])

In [None]:
# notebook preview of final_df before duplicate rows are removed
final_df

In [None]:
# remove rows that are exact duplicates, keeping the first occurrence of each
final_df = final_df.drop_duplicates()

In [None]:
# notebook preview of final_df after duplicate rows were removed
final_df

In [None]:
# write the table without follower counts to disk as CSV, leaving out the index column
output_path = "Twitter_user_data"
final_df.to_csv(output_path, index=False)

In [None]:
# write the full table (follower counts included) to disk as CSV, leaving out the index column
output_path = "Twitter_user_data_#count"
df.to_csv(output_path, index=False)

In [None]:
# notebook preview of the full table, follower counts included
df

# Using Twitter API and Tweepy

In [None]:
import tweepy
import configparser
import pandas as pd
import itertools

In [None]:
# define the function that filters collected users against the configured criteria
def _bio_mentions(bio: str, phrases) -> bool:
    """
    tell whether the bio contains at least one of the given phrases as whole words.

    the comparison is case-insensitive, tolerates punctuation around a word ("CEO," matches "ceo")
    and supports multi-word phrases ("vice president"), which a plain `phrase in bio.split()` test
    can never match because split() only yields single tokens.

    :param: bio -- lower-cased bio text.
    :param: phrases -- iterable of keywords / key phrases.

    :return: True if any phrase occurs in the bio, False otherwise.
    """
    import re  # local import so this cell does not depend on the import cell being edited

    for phrase in phrases:
        words = phrase.lower().split()
        if not words:
            continue
        # words of a phrase may be separated by any whitespace; the look-arounds keep
        # "sex" from matching inside "sexy" while still accepting "ceo," or "#ceo"
        pattern = r"(?<!\w)" + r"\s+".join(re.escape(w) for w in words) + r"(?!\w)"
        if re.search(pattern, bio):
            return True
    return False


def users_handler(user_info: "tweepy.models.User", filters: dict):
    """
    this function is used to filter the given User instance by performing the following processes:
        1- extracting the user bio/description (a missing bio is treated as empty).
        2- check that the bio contains at least one of filters['keywords'].
        3- check that the bio contains none of filters['unwanted keywords'].
        4- check that the followers count is strictly greater than filters['followers_count'].

    :param: user_info -- an instance of tweepy.models.User; its `description` and `followers_count` are read.
    :param: filters -- a dictionary with the keys 'keywords', 'unwanted keywords' and 'followers_count'.

    :return: the same user_info object when every check passes, otherwise the integer -1.

    """
    # extracting the fields the filters work on; both may be missing
    user_bio = (user_info.description or "").lower()
    user_follower_count = user_info.followers_count or 0

    if not _bio_mentions(user_bio, filters['keywords']):
        return -1
    if _bio_mentions(user_bio, filters['unwanted keywords']):
        return -1
    if user_follower_count <= filters['followers_count']:
        return -1
    return user_info

In [None]:
def users_adder(main_user_dict: dict, per_itr_user_dict: dict, user_info: "tweepy.models.User"):
    """
    this function is used to add the passed user to the per-iteration dictionary, unless that
    username was already collected (either in the main dictionary or earlier in this iteration).

    :param: main_user_dict -- this is the main dictionary that contains the information about the passed users
                                {'Username':[...], 'Bio':[...], 'profile URL':[...], 'Location':[...],
                                'Websites':[...], '#followers':[...]}; it is only read here.
    :param: per_itr_user_dict -- same structure as main_user_dict but it gets updated every iteration on the key word combination
    :param: user_info -- instance of tweepy.models.User describing the user to add.

    :return: tuple of (per_itr_user_dict, number of users collected so far in per_itr_user_dict).
    """
    user_name = user_info.screen_name

    # check the current batch as well, so the same user cannot be counted twice within one query
    already_collected = (user_name in main_user_dict['Username']
                         or user_name in per_itr_user_dict['Username'])

    if not already_collected:
        per_itr_user_dict['Username'].append(user_name)
        # description and location are optional profile fields; treat a missing value as empty text
        per_itr_user_dict['Bio'].append((user_info.description or '').lower())
        # build the profile URL from the parameter itself rather than from a global loop variable
        per_itr_user_dict['profile URL'].append("https://twitter.com/{}".format(user_name))
        per_itr_user_dict['Location'].append((user_info.location or '').lower())
        per_itr_user_dict['Websites'].append(user_info.url)
        per_itr_user_dict['#followers'].append(user_info.followers_count)

    return per_itr_user_dict, len(per_itr_user_dict['Username'])

In [None]:
# read configs
config = configparser.ConfigParser()
# ConfigParser.read() skips files it cannot open and returns the list of files it did parse,
# so an empty result means config.ini is missing; fail here with a clear message instead of
# a bare KeyError on the 'twitter' section below
if not config.read('config.ini'):
    raise FileNotFoundError("config.ini was not found or could not be read")

# credentials are expected in the [twitter] section of config.ini
twitter_config = config['twitter']

api_key = twitter_config['api_key']
api_key_secret = twitter_config['api_key_secret']

access_token = twitter_config['access_token']
access_token_secret = twitter_config['access_token_secret']

# authentication
auth = tweepy.OAuthHandler(api_key, api_key_secret)
auth.set_access_token(access_token, access_token_secret)


In [None]:
# API client built from the authenticated handler; wait_on_rate_limit=True is passed through to tweepy
api = tweepy.API(
    auth,
    wait_on_rate_limit=True,
)

# NOTE(review): the result is not used anywhere in the visible file -- presumably this call
# only serves as a check that the credentials work; confirm before removing it
public_tweets = api.home_timeline()

In [None]:
# setting the filters up
# NOTE(review): 'co funder' looks like a typo for 'co founder' -- confirm before changing it,
# because this list drives both the search queries and the bio filter.
filters = {'keywords':['CEO', 'vice president', 'president',
                      'chief', 'founder', 'co funder', 'CTO', 'Congress Women', 'Congress men',
                      'senator', 'MP', 'parliament', 'head', 'senior', 'Activist', 'creator', 'board member',
                      'Chairman', 'VP', 'Boss'],
           'unwanted keywords': ['sex', 'porn', 'adult', 'PLAYMATE', 'Model'],
           'followers_count':10000,}

# generating combination of the desired words 2 at a time
desired_words_combinations = list(itertools.combinations(filters['keywords'], 2))

# setting the unwanted words (the query template below adds the first '-')
undsired_words = ' -'.join(filters['unwanted keywords'])

main_user_dict = {'Username':[], 'Bio':[], 'profile URL':[], 'Location':[], 'Websites':[],'#followers':[]}

for word in desired_words_combinations:
    # setting the query
    # NOTE(review): this is tweet-search syntax (OR, -word, lang:) handed to the user-search
    # endpoint -- confirm that the endpoint honours these operators
    desired_words = ' OR '.join(word)
    query = '({}) -{} lang:en'.format(desired_words, undsired_words)
    print("search query is: {} \n".format(query))

    per_iter_user_dict = {'Username':[], 'Bio':[], 'profile URL':[], 'Location':[], 'Websites':[],'#followers':[]}

    # at most 500 results are consumed per query; each item is a user object despite the name `tweet`
    tweets = tweepy.Cursor(api.search_users, q=query, count=20, include_entities=True).items(500)
    for tweet in tweets:
        if users_handler(tweet, filters) == -1:
            continue
        collected_users = users_adder(main_user_dict, per_iter_user_dict, tweet)[1]
        if collected_users > 3:
            print('breaking the loop')
            break

    # merge the batch whether the loop ended through `break` or because the results ran out;
    # otherwise a combination that yields 3 users or fewer would be silently discarded
    for column, values in per_iter_user_dict.items():
        main_user_dict[column].extend(values)

In [None]:
# turn the collected column lists into a table, one row per collected user
df = pd.DataFrame(data=main_user_dict)

In [None]:
# notebook preview of df before duplicate rows are removed
df

In [None]:
# remove rows that are exact duplicates, keeping the first occurrence of each
df = df.drop_duplicates()

In [None]:
# notebook preview of df after duplicate rows were removed
df

In [None]:
# the follower counts were only needed for filtering; leave them out of the exported table
df = df.drop(columns=['#followers'])

In [None]:
# write the final table to disk as CSV, leaving out the index column
output_path = "Twitter_Users_Data1.csv"
df.to_csv(output_path, index=False)