# Useful Python Functions

This file is created to gather Python functions used during Data Analyst training between October 2023 and September 2024.
This might cover anything from going through files to machine learning functions.

# Data Ethics - Machine Learning

## Data Anonymisation Function

Created by Déborah Leclercq

- Library needed for this function to work:
- library anonypy to anonymise data
- import anonypy

In [None]:
def anonymisator(df, categorical, cols_to_anonymise, cols_to_keep, sensitive_columns, k):
    '''
    Anonymise specific columns of a dataframe based on k-anonymity.

    Requires the ``anonypy`` library (``import anonypy``) and pandas (``import pandas as pd``).
    The input dataframe is not modified: the dtype conversion and the temporary
    'rownumber' column are applied to an internal copy.

    Parameters:
    df (pd.DataFrame): The original dataframe.
    categorical (list): Column names to cast to the pandas "category" dtype.
    cols_to_anonymise (list): Columns handed to anonypy as the columns to anonymise.
    cols_to_keep (list): Columns that should not be anonymised.
    sensitive_columns (list): Columns with sensitive information that should not be modified.
        ``cols_to_keep + sensitive_columns`` is passed as one list to ``anonypy.Preserver``.
    k (int): Level of anonymisation for k-anonymity.

    Returns:
    pd.DataFrame: The non-anonymised columns of ``df`` left-joined with the anonymised rows.
    '''
    # Work on a copy so the caller's dataframe keeps its dtypes and does not
    # gain the helper 'rownumber' column as a side effect.
    working = df.copy()

    # anonypy needs the categorical columns to carry the category dtype
    for name in categorical:
        working[name] = working[name].astype("category")

    # Unique identifier used to join the anonymised rows back afterwards
    working['rownumber'] = working.index

    # Build the anonymiser and run k-anonymity
    preserver = anonypy.Preserver(working, cols_to_anonymise, cols_to_keep + sensitive_columns)
    rows = preserver.anonymize_k_anonymity(k=k)
    df_anonyme = pd.DataFrame(rows)

    # anonypy may add a 'count' column; it is not wanted in the result
    if 'count' in df_anonyme.columns:
        df_anonyme = df_anonyme.drop(columns=['count'])

    # NOTE(review): the join assumes the anonymised rows contain a 'rownumber'
    # column (e.g. because it is listed in cols_to_keep) - confirm with callers.
    dataset_anonymised = working.drop(columns=cols_to_anonymise).merge(
        df_anonyme, how='left', on='rownumber'
    )

    # The helper identifier is no longer needed once the join is done
    return dataset_anonymised.drop(columns=['rownumber'])

## Preprocessing Data Function

Created by Déborah Leclercq

- Libraries needed for the next function:
- from sklearn.preprocessing import StandardScaler
- from sklearn.model_selection import train_test_split
- from sklearn.neighbors import KNeighborsClassifier
- from sklearn.metrics import confusion_matrix,  precision_score, recall_score, accuracy_score, f1_score

In [None]:
# create function to train and test after anonymising data
def preprocessor(df, multicategorical_features, continuous_features, target='target', stratify_col='sex', test_size=0.3, random_state=41):
    '''
    One-hot encode, split, scale, fit a 15-neighbour KNN classifier and score it.

    Parameters:
    - df (pd.DataFrame): Input dataframe.
    - multicategorical_features (list): Columns to one-hot encode with pd.get_dummies.
    - continuous_features (list): Columns to standardise with StandardScaler.
    - target (str): Name of the target column.
    - stratify_col (str): Column of ``df`` used to stratify the train/test split.
    - test_size (float): Proportion of rows held out for testing. Default is 0.3.
    - random_state (int): Seed passed to train_test_split.

    Returns:
    - dict: accuracy_score, confusion_matrix, precision_score, recall_score and
      f1_score computed on the test set.
    '''
    # Encode the multi-category columns, then split into features and labels
    encoded = pd.get_dummies(df, columns=multicategorical_features)
    labels = encoded[target]
    features = encoded.drop([target], axis=1)

    # Stratification reads the column from the original (un-encoded) dataframe
    features_train, features_test, labels_train, labels_test = train_test_split(
        features,
        labels,
        random_state=random_state,
        test_size=test_size,
        stratify=df[stratify_col],
    )

    # Fit the scaler on the training rows only, then reuse it for the test rows
    scaler = StandardScaler()
    train_scaled = features_train.copy()
    train_scaled[continuous_features] = scaler.fit_transform(features_train[continuous_features])
    test_scaled = features_test.copy()
    test_scaled[continuous_features] = scaler.transform(features_test[continuous_features])

    # Train the classifier and predict the held-out rows
    classifier = KNeighborsClassifier(n_neighbors=15)
    classifier.fit(train_scaled, labels_train)
    predictions = classifier.predict(test_scaled)

    # Every metric takes (true labels, predicted labels)
    metrics = {
        'accuracy_score': accuracy_score,
        'confusion_matrix': confusion_matrix,
        'precision_score': precision_score,
        'recall_score': recall_score,
        'f1_score': f1_score,
    }
    return {name: metric(labels_test, predictions) for name, metric in metrics.items()}

# Web Scraping

Created by Déborah Leclercq

- The function below was written for a project called "Webscraping Manga" to retrieve the listed entries (rows) from a web page.
- You will need to import the following libraries:
- import pandas as pd
- import requests
- from bs4 import BeautifulSoup

In [None]:
# Defines the function to extract manga data from a given URL.
def extract_manga_data(url):
    """Download the page at ``url`` and return one dictionary per listed manga.

    Parameters:
        url (str): Address of the list page to scrape.

    Returns:
        list[dict]: One dict per row with the keys "Title", "Year", "Rating"
        and "Genre". A value is None when the matching element is missing
        from the row.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    # Imported here so the function works when copied on its own.
    import requests
    from bs4 import BeautifulSoup

    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # NOTE(review): the row selector is an assumption - the class names used
    # below look like list pages where each entry is a <div class="lister-item">.
    # Confirm against the page that is actually scraped.
    manga_rows = soup.find_all("div", class_="lister-item")

    # Built locally so repeated calls do not depend on, or append to, a global list.
    manga_data = []
    for row in manga_rows:
        # Title: link text inside the 'lister-item-header' heading
        header = row.find("h3", class_="lister-item-header")
        if header is not None and header.a is not None:
            title = header.a.text.strip()
        else:
            title = None

        # Rating is optional
        rating_element = row.find("span", class_="ipl-rating-star__rating")
        rating = rating_element.text.strip() if rating_element is not None else None

        # Year is shown in parentheses, which are stripped
        year_element = row.find("span", class_="lister-item-year")
        year = year_element.text.strip("()") if year_element is not None else None

        # Genre text, without surrounding whitespace
        genre_element = row.find("span", class_="genre")
        genre = genre_element.text.strip() if genre_element is not None else None

        manga_data.append({"Title": title, "Year": year, "Rating": rating, "Genre": genre})

    return manga_data

## Conversion Function

Created by Déborah Leclercq

- This function requires Regular Expression Library:
- import re

In [None]:
 # v: str -> int (The function takes a string input and returns an integer)
def ma_function_de_traitement(v):
    """Clean a scraped value and convert it to an integer.

    The value is converted to text, single-character bracketed markers such
    as "[a]" are removed, and for text like "12/24" only the digits before
    the slash are kept.

    Parameters:
        v: Value to clean; it is passed through ``str`` first.

    Returns:
        int: The cleaned number, or 0 when nothing is left after cleaning.

    Raises:
        ValueError: If the cleaned text is not a valid integer.
    """
    import re  # local import so the function works when copied on its own

    text = str(v)

    # Remove markers made of exactly one character between square brackets.
    text = re.sub(r"\[.\]", "", text)

    # Keep only the digits that precede a slash. The replacement has to be a
    # raw string: a plain "\1" is the control character \x01, which would
    # throw the captured digits away instead of restoring them.
    text = re.sub(r"([0-9]+)/.*", r"\1", text)

    return int(text) if text else 0


# Functions created during Internship

Created by Déborah Leclercq

This function was needed during the internship to isolate a specific piece of
text in a column so that the column could be split at a specified keyword.

It requires:
import re - regular expression library
import numpy as np - to manage NaN values when there are some

In [None]:
# import re 
# regex for regular expressions - helps search for specific strings
# import numpy as np # helps manage nan values

def extract_phase(sentence):
    '''Return the first development-stage keyword found in a sentence.

    Input:
        sentence (str): The string in which the function searches for the pattern.

    Output:
        str, None or np.nan:
            The matched text itself (for example "Phase II" or "Discontinued"),
            i.e. the slice of the sentence between the start and end indexes
            of the first match.
            None when the sentence contains no match.
            np.nan when the input is missing (pd.isna(sentence) is true).

    How to adapt it:
        expression : the regular expression to look for. https://regex101.com
        is handy to try it out against your own strings.
        re.search : scans the string for the first location where the pattern
        matches, see https://docs.python.org/3/library/re.html#re.Pattern.search

    Requires re, numpy as np and pandas as pd to be imported.
    '''
    # Missing values are passed through as NaN
    if pd.isna(sentence):
        return np.nan

    # Regex covering every development stage / status wording to isolate.
    # NOTE(review): text inside [...] is a character class, so the quoted
    # alternatives there match single characters, not whole words.
    expression = r"Phase ['2\/3''I\/II''II\/III''II\-III'IV\da|b]+|Preclinical ['planned'\d]+|Inactive|Discontinued|X|Unknown|Preclinical|Discovery|Marketed|preclinical|In Vitro|Withdrawn|In vitro|x|\?|Invitro"
    pattern = re.compile(expression)

    # Search once and reuse the match object
    match = pattern.search(sentence)
    if match is None:
        return None

    # group(0) is the text between the start and end indexes of the match
    return match.group(0)


# Gather several DFs in one excel doc

Created by Déborah Leclercq

This is not a function but was a useful tool during internship to gather in one excel file several df created while cleaning and reformatting the file.
- this requires import pandas as pd

In [None]:
# This code writes multiple DataFrames into an Excel file, with each DataFrame in a separate sheet.

# Create an Excel writer object to write multiple sheets to one file.
# One workbook, one sheet per DataFrame. Replace the file path and the frames with your own.
with pd.ExcelWriter("./output_data.xlsx") as writer:
    # Sheet name -> DataFrame, written in this order without the row index.
    sheets = {'sheet_A': df_A, 'sheet_B': df_B, 'sheet_C': df_C}
    for sheet_name, frame in sheets.items():
        frame.to_excel(writer, sheet_name=sheet_name, index=False)

# Find a specific column

Created by Déborah Leclercq

If you have a df with several columns, some of which are repeated throughout, and you need to identify the first occurrence, this function will be useful.
- this requires pandas library - import pandas as pd

In [None]:
def find_first_comments_column(df, prefix="Col_name_to_search"):
    """Return the position of the first column whose name starts with ``prefix``.

    Parameters:
        df (pd.DataFrame): Dataframe whose columns are inspected, left to right.
        prefix (str): Beginning of the column name to look for.

    Returns:
        int: Zero-based position of the first matching column, or -1 when no
        column name starts with ``prefix``.
    """
    # The enumerated position is returned instead of df.columns.get_loc(column),
    # because get_loc does not return a single integer when the label is duplicated.
    for position, column in enumerate(df.columns):
        # Column labels that are not strings cannot match a text prefix
        if isinstance(column, str) and column.startswith(prefix):
            return position
    # No matching column was found
    return -1

# Example usage: look up the position of the first matching column in an
# existing dataframe `df` and print it (-1 means that no column matched).
index_comments = find_first_comments_column(df)
print("First cols 'Comments' index is :", index_comments)


# get the columns and their related indexes

Created by Déborah Leclercq

This function displays all the columns of a df together with their indexes.
- requires pandas library - import pandas as pd

In [None]:
# write all index and related cols
def index_cols_info(df):
    """Print the position and the name of every column of ``df``.

    Parameters:
        df (pd.DataFrame): Dataframe whose columns are listed.

    Returns:
        None. One line per column is printed, in column order.
    """
    # The enumerated position is printed directly. Looking it up in df.index
    # would read the row labels, which fails when there are more columns than rows.
    for position, column_name in enumerate(df.columns):
        print(f"Index:{position}, Column: {column_name} ")

# Separating a df at a specific column

Created by Déborah Leclercq

Isolate each section of the df related to a selected part and give each sub-df a specific name.
- this was created because I needed to slice my df at particular sections and identify each one
- this requires pandas library - import pandas as pd

In [None]:
import pandas as pd

def slice_dataframe(df, start_indices, end_indices):
    """Slice a DataFrame into column-wise sub-DataFrames.

    Each (start, end) pair selects the columns at positions ``start`` up to,
    but not including, ``end`` (``df.iloc[:, start:end]``).

    Parameters:
    - df: DataFrame to slice.
    - start_indices: List of column positions where each section starts
      (e.g. the "Disease" columns).
    - end_indices: List of column positions where each section stops
      (e.g. the "Comments" columns); the column at this position is excluded.

    Returns:
    - Dictionary of sub-DataFrames keyed by 'df_<name of the section's first
      column>'. If several sections start at columns with the same name, the
      later ones get a numeric suffix ('_2', '_3', ...) so that no section is
      overwritten. Pairs are formed with zip, so extra entries in the longer
      list are ignored.
    """
    dfs = {}
    for start, end in zip(start_indices, end_indices):
        base_key = f'df_{df.columns[start]}'  # Named after the first column of the section

        # Keep earlier sections when the same starting column name comes up again
        key = base_key
        suffix = 2
        while key in dfs:
            key = f'{base_key}_{suffix}'
            suffix += 1

        dfs[key] = df.iloc[:, start:end]
    return dfs
