In [1]:
#Author: @michaelbrink
#Org: BalloonBox Inc.

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re

# Import the data files

In [3]:
# Load the two raw Fortune 500 tables (paths are relative to the working directory)
df_all, df_detail = map(pd.read_csv, ('Fortune500.csv','Fortune500-2.csv'))

## Data Cleaning

In [4]:
# First, drop the duplicated and useless columns in df_detail.
# errors='ignore' skips any listed column that is not present, so this cell
# can be re-run without raising.
drop_list = ['Revenues ($M).1','Profits ($M).1','Unnamed: 25']
df_detail = df_detail.drop(columns=drop_list,errors='ignore')

In [5]:
def data_cleaning(df_origin):
    '''
        Return a cleaned copy of df_origin in which every non-text column is numeric.

        df_origin is the raw dataframe we want to clean: DataFrame

        Columns named in text_list are left untouched. For every other column:
          1. the characters '$', ',' and '%' are stripped from the values;
          2. if the column name contains '%', 'Change' or 'Return', a value that
             is exactly '-' becomes 0;
          3. the column is converted with pd.to_numeric(errors='coerce'), so any
             value that is still not a number (e.g. a remaining '-') becomes NaN.

        Percentage columns are NOT divided by 100.
        df_origin itself is not modified.
    '''
    # Work on a copy so the caller's dataframe stays intact
    df = df_origin.copy()

    # The list of text features that don't need cleaning
    text_list=['Name','Country','Headquarters','Industry','CEO','Website','Company Type','Ticker']

    for col in df.columns:
        # Text data doesn't need cleaning
        if col in text_list:
            continue

        # 1: Strip currency / thousands / percent symbols.
        #    Raw string: '\$' in a normal literal is an invalid escape sequence.
        cleaned = df[col].replace(to_replace=r'[\$,%]',value='',regex=True)

        # 2: A lone '-' in '%', 'Change' and 'Return' columns means 0
        if ('%' in col) or ('Change' in col) or ('Return' in col):
            cleaned = cleaned.replace(to_replace='^-$',value='0',regex=True)

        # 3: Convert to numeric; whatever is still non-numeric becomes NaN
        df[col] = pd.to_numeric(cleaned,errors='coerce')

    return df

In [6]:
# Clean df_all, then discard every row that still holds a missing value
df_all_cleaned = data_cleaning(df_all).dropna()

# Report the (rows, columns) before and after cleaning
print('The shape of original df_all is :',df_all.shape)
print('The shape of cleaned df_all is : ',df_all_cleaned.shape)

The shape of original df_all is : (4000, 14)
The shape of cleaned df_all is :  (3774, 14)


In [7]:
# Clean df_detail, then discard every row that still holds a missing value.
# dropna() is a no-op when nothing is missing, so no explicit NaN check is needed.
df_detail_cleaned = data_cleaning(df_detail).dropna()

# Report the (rows, columns) before and after cleaning
print('The shape of original df_detail is :',df_detail.shape)
print('The shape of cleaned df_detail is : ',df_detail_cleaned.shape)

The shape of original df_detail is : (1000, 23)
The shape of cleaned de_detail is :  (924, 23)


In [8]:
def drop_by_quantile(df_origin,cols):
    '''
        Return a copy of df_origin without the rows that are boxplot outliers
        in at least one of the given columns.

        df_origin is the dataframe we want to process: DataFrame
        cols is a list of columns we want to process: List

        For each column the fences are Q1 - 1.5*IQR and Q3 + 1.5*IQR, computed
        on the full input (before any row is removed). A row is an outlier when
        its value lies strictly outside the fences; values exactly on a fence
        are kept, so a column with IQR == 0 does not lose the rows that sit on
        its quartile value.

        df_origin itself is not modified.
    '''
    # True for every row flagged as an outlier by at least one column
    outlier_mask = pd.Series(False,index=df_origin.index,dtype=bool)

    for col in cols:
        values = df_origin[col]

        # Boxplot fences for this column
        Q1, Q3 = values.quantile([0.25,0.75])
        IQR = Q3-Q1
        minimum = Q1-1.5*IQR
        maximum = Q3+1.5*IQR

        # Comparisons against NaN are False, so missing values are never flagged
        outlier_mask |= (values<minimum) | (values>maximum)

    # Keep only the rows no column flagged
    return df_origin.loc[~outlier_mask].copy()

In [9]:
# Remove the outlier rows of df_all_cleaned for these numeric columns
drop_cols = ['Revenue ($M)','Profit ($M)','Assets ($M)','Market Value ($M)','Employees']
df_all_dropped = drop_by_quantile(df_origin=df_all_cleaned,cols=drop_cols)

# Shape after each stage so far
for label, frame in (
    ('The shape of original df_all is :',df_all),
    ('The shape of cleaned df_all is : ',df_all_cleaned),
    ('The shape of dropped df_all(quantiles) is : ',df_all_dropped),
):
    print(label,frame.shape)

The shape of original df_all is : (4000, 14)
The shape of cleaned df_all is :  (3774, 14)
The shape of dropped df_all(quantiles) is :  (2755, 14)


In [10]:
# Process df_detail_cleaned dataset
drop_cols = ['Revenues ($M)','Profits ($M)','Market Value ($M)','Employees','Assets ($M)',
             'Total Stockholder Equity ($M)','Earnings Per Share ($)']
df_detail_dropped = drop_by_quantile(df_detail_cleaned,drop_cols)

# Results from each step (labels name df_detail, the dataset actually printed)
print('The shape of original df_detail is :',df_detail.shape)
print('The shape of cleaned df_detail is : ',df_detail_cleaned.shape)
print('The shape of dropped df_detail(quantiles) is : ',df_detail_dropped.shape)

The shape of original df_all is : (1000, 23)
The shape of cleaned df_all is :  (924, 23)
The shape of dropped df_all(quantiles) is :  (611, 23)


In [11]:
def create_buckets(df_origin,cols,n_buckets=5):
    '''
        Return a copy of df_origin extended with one-hot columns that tell
        which quantile bucket each row falls into, for every column in cols.

        df_origin is the dataframe we want to process: DataFrame
        cols is a list of numeric columns we want to bucket: List
        n_buckets is the number of quantile buckets per column: int (default 5)

        Each column is cut with pd.qcut at the quantiles 0, 1/n, ..., 1 and
        labelled 1..n_buckets. The labels are then one-hot encoded with
        pd.get_dummies, which adds the columns '<col>_cat_1' ... '<col>_cat_<n>'
        to the right of the original ones.

        pd.qcut raises ValueError when a column's quantile edges are not unique.
        df_origin itself is not modified.
    '''
    # Make a copy of origin dataset
    df = df_origin.copy()

    # Nothing to bucket: return the unmodified copy
    if len(cols) == 0:
        return df

    # Quantile edges and the bucket labels they delimit
    quantiles = [i/n_buckets for i in range(n_buckets+1)]
    labels = list(range(1,n_buckets+1))

    # One categorical '<col>_cat' column per requested feature
    buckets = pd.DataFrame({
        col+'_cat': pd.qcut(x=df[col],q=quantiles,labels=labels)
        for col in cols
    })

    # One-hot encode the bucket labels
    dummies = pd.get_dummies(buckets)

    # Concat the dataFrames together
    return pd.concat([df,dummies],axis=1)

In [12]:
# Add the one-hot bucket columns for these df_all features
dummy_list = ['Profit ($M)','Assets ($M)','Market Value ($M)','Employees']
df_all_final = create_buckets(df_origin=df_all_dropped,cols=dummy_list)

# Shape after each stage of the pipeline
for label, frame in (
    ('The shape of original df_all is :',df_all),
    ('The shape of cleaned df_all is : ',df_all_cleaned),
    ('The shape of dropped df_all(quantiles) is : ',df_all_dropped),
    ('The shape of final df_all is : ',df_all_final),
):
    print(label,frame.shape)

The shape of original df_all is : (4000, 14)
The shape of cleaned df_all is :  (3774, 14)
The shape of dropped df_all(quantiles) is :  (2755, 14)
The shape of final df_all is :  (2755, 34)


In [13]:
# Dummy process for the df_detail dataset
dummy_list = ['Profits ($M)','Market Value ($M)','Employees','Assets ($M)',
              'Total Stockholder Equity ($M)','Earnings Per Share ($)']
df_detail_final = create_buckets(df_detail_dropped,dummy_list)

# Results from each step (labels name df_detail, the dataset actually printed)
print('The shape of original df_detail is :',df_detail.shape)
print('The shape of cleaned df_detail is : ',df_detail_cleaned.shape)
print('The shape of dropped df_detail(quantiles) is : ',df_detail_dropped.shape)
print('The shape of final df_detail is : ',df_detail_final.shape)

The shape of original df_all is : (1000, 23)
The shape of cleaned df_all is :  (924, 23)
The shape of dropped df_all(quantiles) is :  (611, 23)
The shape of final df_all is :  (611, 53)


In [14]:
# Persist both final tables as CSV, leaving the index out of the files
for frame, csv_name in (
    (df_all_final,'Fortune500_cleaned.csv'),
    (df_detail_final,'Fortune500-2_cleaned.csv'),
):
    frame.to_csv(csv_name,index=False)