In [1]:
import sys
import numpy as np
import pandas as pd
from sqlalchemy import create_engine

In [2]:
def load_data(messages_filepath, categories_filepath):
    """
    Read the messages and categories CSV files and join them on ``id``.

    Parameters:
        messages_filepath: (str) path of the messages CSV file.
        categories_filepath: (str) path of the categories CSV file.

    Return:
        pandas DataFrame obtained by merging the two files on their shared
        ``id`` column (pandas' default inner join, so only ids present in
        both files are kept).
    """
    # Read both inputs the same way, messages first so it is the left frame.
    messages_frame, categories_frame = [
        pd.read_csv(csv_path)
        for csv_path in (messages_filepath, categories_filepath)
    ]

    return messages_frame.merge(categories_frame, on='id')


In [3]:
# NOTE(review): these are absolute, machine-specific paths; the notebook will
# only run where this exact directory layout exists.
df = load_data(
    messages_filepath='/home/bambar/Nano Degree/Project 2/data/disaster_messages.csv',
    categories_filepath='/home/bambar/Nano Degree/Project 2/data/disaster_categories.csv',
)

In [4]:
def clean_data(df):
    """
    Clean the merged DataFrame.

    Steps:
        - expand the ``;``-separated ``categories`` string into one column
          per category,
        - name each new column after the text before the last ``-`` of the
          corresponding entry in the first row,
        - convert each value (the text after the last ``-``) to an integer,
        - replace the original ``categories`` column with the new columns,
        - remove duplicate rows.

    The input frame is left untouched, so the function can be called again
    on the same ``df`` (e.g. when a notebook cell is re-run).

    Args:
        df: dataframe containing messages and a ``categories`` column whose
            entries look like ``"name_a-1;name_b-0;..."``. It must have at
            least one row, because the first row supplies the column names.

    Returns:
        DataFrame: Cleaned dataframe (a new object).

    Raises:
        IndexError: if ``df`` has no rows.
        ValueError: if a value after the last ``-`` is not an integer.
    """
    # Split categories into separate columns (one per ';'-separated entry).
    category_values = df['categories'].str.split(';', expand=True)

    # Use the first row to extract the category names. Splitting on the last
    # '-' (rather than slicing off two characters) keeps names intact even
    # when the value has more than one digit.
    first_row = category_values.iloc[0]
    category_values.columns = [label.rsplit('-', 1)[0] for label in first_row]

    # Keep only the part after the last '-' and convert it to an integer.
    # NOTE(review): values are kept exactly as parsed; nothing restricts them
    # to 0/1 - confirm whether downstream code expects binary labels.
    for column in category_values.columns:
        category_values[column] = (
            category_values[column].str.rsplit('-', n=1).str[-1].astype(int)
        )

    # Replace the raw `categories` column with the expanded columns.
    # `drop` without inplace returns a copy, so the caller's frame is not
    # modified.
    cleaned = pd.concat(
        [df.drop(columns=['categories']), category_values], axis=1
    )

    # Remove duplicates.
    cleaned = cleaned.drop_duplicates()

    print("Duplicate Count=", cleaned.duplicated().sum())

    return cleaned

In [5]:
# Store the cleaned frame so later cells can use it; the bare name on the
# last line makes the notebook display it.
df_clean = clean_data(df)
df_clean

Duplicate Count= 0


Unnamed: 0,id,message,original,genre,related,request,offer,aid_related,medical_help,medical_products,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct,1,0,0,1,0,0,...,0,0,1,0,1,0,0,0,0,0
2,8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ...",direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,9,UN reports Leogane 80-90 destroyed. Only Hospi...,UN reports Leogane 80-90 destroyed. Only Hospi...,direct,1,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,12,"says: west side of Haiti, rest of the country ...",facade ouest d Haiti et le reste du pays aujou...,direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26381,30261,The training demonstrated how to enhance micro...,,news,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26382,30262,A suitable candidate has been selected and OCH...,,news,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26383,30263,"Proshika, operating in Cox's Bazar municipalit...",,news,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26384,30264,"Some 2,000 women protesting against the conduc...",,news,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
def save_data(df, database_filename):
    """
    Save the cleaned data to a SQLite database.

    The frame is written to a table named ``categories``; an existing table
    with that name is replaced. The DataFrame index is not written.

    Args:
        df (pandas.DataFrame): Cleaned dataframe.
        database_filename (str): Filepath for the output SQLite database.

    Returns:
        None
    """
    engine = create_engine(f'sqlite:///{database_filename}')
    try:
        df.to_sql('categories', engine, if_exists='replace', index=False)
    finally:
        # Release the engine's pooled connections so the database file is
        # not kept open after the write (also runs if the write fails).
        engine.dispose()

In [None]:
def main():
    """
    Command-line entry point for the data processing pipeline.

    Expects exactly three arguments after the program name: the messages
    CSV path, the categories CSV path and the output database path. With
    them it loads, cleans and saves the data, printing progress along the
    way. With any other argument count it prints a usage message instead.
    """
    cli_args = sys.argv[1:]

    # Guard clause: anything other than three arguments gets the usage text.
    if len(cli_args) != 3:
        usage = ('Please provide the filepaths of the messages and categories '
                 'datasets as the first and second argument respectively, as '
                 'well as the filepath of the database to save the cleaned data '
                 'to as the third argument. \n\nExample: python process_data.py '
                 'disaster_messages.csv disaster_categories.csv '
                 'DisasterResponse.db')
        print(usage)
        return

    messages_filepath, categories_filepath, database_filepath = cli_args

    loading_banner = 'Loading data...\n    MESSAGES: {}\n    CATEGORIES: {}'
    print(loading_banner.format(messages_filepath, categories_filepath))
    merged = load_data(messages_filepath, categories_filepath)

    print('Cleaning data...')
    cleaned = clean_data(merged)

    saving_banner = 'Saving data...\n    DATABASE: {}'
    print(saving_banner.format(database_filepath))
    save_data(cleaned, database_filepath)

    print('Cleaned data saved to database!')


# Run the pipeline only when this file is executed directly, not when it is
# imported as a module.
if __name__ == '__main__':
    main()