<a href="https://colab.research.google.com/github/Flychuban/Predict-Cancer-Mortality-Rates/blob/main/PredictCancer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np

In [2]:
# Load the raw dataset into a DataFrame.
# NOTE(review): the hard-coded path looks like a Colab Google Drive mount, so
# this presumably requires the drive to be mounted first — confirm.
df = pd.read_csv("/content/drive/MyDrive/CancerMortalityRatesPredict/data/cancer_reg.csv")

In [3]:
def find_constant_columns(dataframe):
    """
    Return the names of the columns that hold exactly one distinct value.

    Distinct values are counted with ``Series.unique()``, so a missing value
    counts as a value of its own.

    Parameters:
    dataframe (pandas.DataFrame): The dataframe to inspect

    Returns:
    list: Column names, in dataframe order, whose values are all identical
    """
    return [
        name
        for name in dataframe.columns
        if len(dataframe[name].unique()) == 1
    ]

In [4]:
def delete_constant_columns(dataframe, columns_to_delete):
    """
    Return a copy of the dataframe without the listed columns.

    The function drops whatever columns it is given; deciding which columns
    are constant is left to the caller. The input dataframe is not modified.

    Parameters:
    dataframe (pandas.DataFrame): The dataframe to remove columns from
    columns_to_delete (list): Names of the columns to remove

    Returns:
    pandas.DataFrame: A new dataframe with the listed columns removed
    """
    return dataframe.drop(columns=columns_to_delete)

In [5]:
def find_columns_with_few_values(dataframe, threshold):
    """
    Return the names of the columns with fewer than ``threshold`` distinct values.

    Distinct values are counted with ``Series.unique()``, and the comparison
    is strict: a column with exactly ``threshold`` distinct values is not
    reported.

    Parameters:
    dataframe (pandas.DataFrame): The dataframe to inspect
    threshold (int): Columns with fewer distinct values than this are reported

    Returns:
    list: Column names, in dataframe order, below the threshold
    """
    return [
        name
        for name in dataframe.columns
        if len(dataframe[name].unique()) < threshold
    ]

In [6]:
def find_duplicate_rows(dataframe):
    """
    Return the rows that repeat an earlier row of the dataframe.

    The first occurrence of each row is not included; only the later
    repetitions are returned, with their original index labels.

    Parameters:
    dataframe (pandas.DataFrame): The dataframe to inspect

    Returns:
    pandas.DataFrame: The repeated rows
    """
    # True for every row that has already appeared further up
    is_repeat = dataframe.duplicated()
    return dataframe.loc[is_repeat]

In [7]:
def delete_duplicate_rows(dataframe):
    """
    Return the dataframe with repeated rows removed.

    The first occurrence of each row is kept and later repetitions are
    dropped. Index labels of the surviving rows are preserved and the input
    dataframe is not modified.

    Parameters:
    dataframe (pandas.DataFrame): The dataframe to de-duplicate

    Returns:
    pandas.DataFrame: The dataframe without duplicate rows
    """
    # Mark every row that repeats an earlier one, then keep the rest
    is_repeat = dataframe.duplicated(keep="first")
    return dataframe.loc[~is_repeat]

In [8]:
def drop_and_fill(dataframe, missing_threshold=0.5):
    """
    Drop columns that are mostly missing and mean-impute the remaining gaps.

    Columns whose fraction of missing values is strictly greater than
    ``missing_threshold`` are removed. Remaining missing values in numeric
    columns are replaced with that column's mean. Non-numeric columns are
    left untouched, since they have no mean. The input dataframe is not
    modified.

    Parameters:
    dataframe (pandas.DataFrame): The dataframe to clean
    missing_threshold (float): Maximum tolerated fraction of missing values
        per column (default 0.5)

    Returns:
    pandas.DataFrame: A new dataframe with sparse columns dropped and numeric
    gaps filled
    """
    # Fraction of missing entries per column
    missing_fraction = dataframe.isnull().mean()
    cols_to_drop = dataframe.columns[missing_fraction > missing_threshold]
    dataframe = dataframe.drop(cols_to_drop, axis=1)
    # numeric_only=True is required: since pandas 2.0, DataFrame.mean() raises
    # TypeError when the frame contains string columns instead of skipping them.
    column_means = dataframe.mean(numeric_only=True)
    return dataframe.fillna(column_means)

In [9]:
def split_data(dataframe, target_column):
    """
    Separate the target column from the features and split both into train and test parts.

    NOTE(review): ``tts`` is not defined in the visible part of this file; it
    is presumably scikit-learn's ``train_test_split`` imported under that
    alias — confirm.

    Parameters:
    dataframe (pandas.DataFrame): The dataframe holding features and target
    target_column (str): The name of the target column

    Returns:
    tuple: ``(X_train, X_test, y_train, y_test)`` as produced by ``tts`` with
    ``test_size=0.2`` and ``random_state=0``
    """
    features = dataframe.drop(columns=target_column)
    target = dataframe[target_column]
    # Fixed seed keeps the split reproducible between runs
    X_train, X_test, y_train, y_test = tts(
        features, target, test_size=0.2, random_state=0
    )
    return X_train, X_test, y_train, y_test

In [10]:
def bin_to_num(data):
    """
    Replace the textual ``binnedinc`` interval column with numeric columns.

    Each ``binnedinc`` entry is a string such as ``"(61494.5, 125635]"``. The
    surrounding parentheses/brackets are stripped and the two comma-separated
    numbers are parsed as floats.

    The dataframe is modified in place: ``lower_bound``, ``upper_bound`` and
    ``median`` (the midpoint of the two bounds) are added and ``binnedinc`` is
    dropped.

    Parameters:
    data (pandas.DataFrame): Dataframe containing a string ``binnedinc`` column

    Returns:
    pandas.DataFrame: The same dataframe object, after modification
    """
    # Parse every interval string into a [lower, upper] pair of floats
    bounds = [
        [float(part) for part in interval.strip("()[]").split(",")]
        for interval in data["binnedinc"]
    ]

    data["lower_bound"] = [pair[0] for pair in bounds]
    data["upper_bound"] = [pair[1] for pair in bounds]
    # Midpoint of the income bin
    data["median"] = (data["lower_bound"] + data["upper_bound"]) / 2
    data.drop("binnedinc", axis=1, inplace=True)
    return data

In [11]:
def cat_to_col(data):
    """
    Split the ``geography`` column into separate ``county`` and ``state`` columns.

    Each ``geography`` entry is split on commas; the first piece becomes
    ``county`` and the second becomes ``state``. Surrounding whitespace is
    stripped from both, so ``"Kitsap County, Washington"`` yields the state
    ``"Washington"`` rather than ``" Washington"``.

    The dataframe is modified in place: the two new columns are added and
    ``geography`` is dropped.

    Parameters:
    data (pandas.DataFrame): Dataframe containing a string ``geography`` column

    Returns:
    pandas.DataFrame: The same dataframe object, after modification

    Raises:
    IndexError: If a ``geography`` entry contains no comma
    """
    pieces = [entry.split(",") for entry in data["geography"]]
    data["county"] = [parts[0].strip() for parts in pieces]
    # Strip the space that follows the comma separator
    data["state"] = [parts[1].strip() for parts in pieces]
    data.drop("geography", axis=1, inplace=True)
    return data

In [12]:
def one_hot_encoding(X):
    """
    Replace every object-dtype column of ``X`` with one-hot indicator columns.

    A new ``OneHotEncoder`` is fitted on the object-dtype columns of ``X``
    each time this function is called. The indicator columns are appended
    after the remaining columns, and the row index of ``X`` is preserved.

    NOTE(review): ``OneHotEncoder`` is not defined in the visible part of this
    file; it is presumably ``sklearn.preprocessing.OneHotEncoder`` — confirm.
    NOTE(review): because the encoder is refitted on every call, encoding
    train and test frames separately can yield different column sets.

    Parameters:
    X (pandas.DataFrame): The feature dataframe

    Returns:
    pandas.DataFrame: A new dataframe with the categorical columns replaced by
    their one-hot encoding
    """
    categorical_columns = X.select_dtypes(include=["object"]).columns
    # Nothing to encode: hand back a copy so the caller still gets a new frame
    if len(categorical_columns) == 0:
        return X.copy()

    # scikit-learn 1.2 renamed ``sparse`` to ``sparse_output`` and 1.4 removed
    # the old name; try the new keyword first and fall back for old releases.
    try:
        one_hot_encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
    except TypeError:
        one_hot_encoder = OneHotEncoder(sparse=False, handle_unknown="ignore")

    encoded_values = one_hot_encoder.fit_transform(X[categorical_columns])
    # Reuse X's index: pd.concat(axis=1) aligns on index labels, so a default
    # RangeIndex here would misalign rows (and introduce NaN rows) whenever X
    # does not itself have a 0..n-1 index, e.g. after a shuffled split.
    one_hot_encoded = pd.DataFrame(
        encoded_values,
        columns=one_hot_encoder.get_feature_names_out(categorical_columns),
        index=X.index,
    )
    remaining = X.drop(categorical_columns, axis=1)
    return pd.concat([remaining, one_hot_encoded], axis=1)