# Twitter Lists - Filtering

### Global Packages

In [39]:
import json
import pandas as pd

#### Parameters and Constants

In [54]:
# Directory that holds the raw JSON dumps read by this notebook.
# NOTE(review): this is an absolute, machine-specific path, so the notebook
# cannot run on another machine without editing it.
PATH = "C:/Users/fdmol/Desktop/Energy-Lab/twitter_search/twitter_search/data/raw_data"
# Columns retained by create_df; every other column is dropped.
COLS_TO_KEEP = ["user_id", "list_id", "name", "description"]

# Lower-case keywords that mark a list as relevant when one of them appears
# as a token in the list name or description (see ListFilter.filter_text).
LISTS_KEYWORDS = ["air", "pollution", "earth", "climate", "smog"]


#### Functions and Classes

In [47]:
def read_json(file_path):
    """
    Reads a JSON file and returns its parsed content

    Args:
        file_path (str): Path to the JSON file
    Returns:
        data (dict or list): Parsed JSON data
    Raises:
        OSError: If the file cannot be opened
        json.JSONDecodeError: If the file does not contain valid JSON
    """
    # Decode explicitly as UTF-8. Without an encoding, open() uses the
    # platform locale (e.g. cp1252 on Windows), which mis-decodes or rejects
    # non-ASCII text such as the emoji found in list names and descriptions.
    with open(file_path, "r", encoding="utf-8") as file:
        data = json.load(file)
    return data


def create_df(twitter_lists, cols_to_keep=None):
    """
    Creates a single DataFrame from the JSON data

    Empty (or None) entries of ``twitter_lists`` are skipped. All remaining
    entries are converted to DataFrames and concatenated with a fresh index.

    Args:
        twitter_lists (list): List whose entries are lists of dictionaries
            with the JSON data (one entry per group of Twitter lists)
        cols_to_keep (list, optional): Columns to retain in the result.
            Defaults to the module-level COLS_TO_KEEP
    Returns:
        lists_df (pd.DataFrame): DataFrame restricted to the kept columns.
            When there is no non-empty entry, an empty DataFrame that still
            has the kept columns is returned.
    Raises:
        KeyError: If the data is non-empty but lacks one of the kept columns
    """
    columns = list(COLS_TO_KEEP if cols_to_keep is None else cols_to_keep)

    # Build every per-entry frame first and concatenate once: growing a
    # DataFrame with concat inside the loop copies all rows on each pass.
    frames = [
        pd.DataFrame(twitter_list)
        for twitter_list in twitter_lists
        if twitter_list  # remove entry if empty
    ]

    if not frames:
        # Nothing to concatenate; keep the expected schema so that callers
        # can still select columns such as "list_id".
        return pd.DataFrame(columns=columns)

    lists_df = pd.concat(frames, ignore_index=True)

    return lists_df.loc[:, columns].copy()

In [81]:
class ListFilter:
    """
    Class to filter lists based on keywords

    Args:
        df (pd.DataFrame): DataFrame with the lists to be filtered
        keywords (iterable of str, optional): Keywords of interest used by
            is_relevant. Defaults to the module-level LISTS_KEYWORDS
    """

    def __init__(self, df, keywords=None) -> None:
        self.df = df
        # None means "use LISTS_KEYWORDS"; resolved lazily in filter_text
        self.keywords = keywords

    @staticmethod
    def clean_text(text):
        """
        Lower-cases text and removes special characters

        Every character that is not a letter or a digit (punctuation,
        symbols, emoji) is replaced by a space and runs of whitespace are
        collapsed, so words glued to punctuation become separate tokens.

        Args:
            text (str): Text to be cleaned. Non-string values such as
                None or NaN are treated as empty text
        Returns:
            text (str): Cleaned text
        """
        # Missing values carry no text; avoid AttributeError on .lower()
        if not isinstance(text, str):
            return ""

        # replace special characters with spaces
        spaced = "".join(
            char if char.isalnum() else " " for char in text.lower()
        )

        # collapse the whitespace left behind by the replacement
        return " ".join(spaced.split())

    @staticmethod
    def filter_text(text, keywords=None):
        """
        Determines if the text contains a keyword
        of interest

        Keywords are matched as whole, case-insensitive words after the
        text has been cleaned, so "Air," and "#smog" match while "fair"
        does not match "air".

        Args:
            text (str): Text to be searched. Non-string values never match
            keywords (iterable of str, optional): Keywords to look for.
                Defaults to the module-level LISTS_KEYWORDS
        Returns:
            (bool): True if at least one keyword is a word of the text
        """
        if keywords is None:
            keywords = LISTS_KEYWORDS

        tokens = set(ListFilter.clean_text(text).split())

        return any(keyword.lower() in tokens for keyword in keywords)

    def is_relevant(self, row):
        """
        Determines if a list is relevant, i.e. if its name or its
        description contains a keyword of interest

        Meant to be used row-wise, e.g. ``df.apply(self.is_relevant, axis=1)``.

        Args:
            row (pd.Series or dict): Row with "name" and "description" keys
        Returns:
            (bool): True if the name or the description is relevant
        """
        # The description is only inspected when the name is not relevant
        return self.filter_text(row["name"], self.keywords) or self.filter_text(
            row["description"], self.keywords
        )

#### Pipeline

In [82]:
# Load the raw Mumbai dump and combine its entries into one DataFrame.
twitter_lists = read_json(f"{PATH}/Mumbai_lists.json")
lists_df = create_df(twitter_lists)
# Keep a single row per list_id. Done in place, so lists_df remains the same
# object that the next cell adds the "relevant" column to.
lists_df.drop_duplicates(subset=["list_id"], inplace=True)

In [83]:
# Flag every list whose name or description mentions a keyword of interest.
list_filter = ListFilter(lists_df)
relevance_flags = lists_df.apply(list_filter.is_relevant, axis=1)
lists_df["relevant"] = relevance_flags

# Keep only the rows that were flagged as relevant.
is_flagged = lists_df["relevant"].eq(True)
relevant_lists = lists_df.loc[is_flagged, :]

In [84]:
# Display the lists that were flagged as relevant.
relevant_lists

Unnamed: 0,user_id,list_id,name,description,relevant
1,1410225032354172932,1732746835570430409,No to Air pollution,Stop Air pollution Stop Toxic Gases,True
10,1410225032354172932,1518253847445217280,Pollution,"Accounts/Posts related to land, air & water po...",True
14,1410225032354172932,194782213,EARTH 🌍 PROTECTING GROUP,TO PROTECT EARTH 🌍._ JAI SRI RAM.,True
113,1247376074272641025,1765717997334024210,Air quality,,True
114,1247376074272641025,1743504404563853614,Air pollution Researcher,People who are actively working to solve the a...,True
188,4429482316,766501326650048512,Smokey's friends,List for interesting people in the air quality...,True
192,4429482316,745346183120420865,Air Quality Experts,,True
193,4429482316,710779808536367104,smog 2016,,True
196,4429482316,718131431600484352,Air People,,True
