# Cleaning the data

### In this notebook we clean the data in the all_census_data.csv file

In [71]:
# imports
import pandas as pd
import numpy as np

df = pd.read_csv('all_census_data.csv');

In [69]:
states = {'Alabama': 'AL',
         'Alaska': 'AK',
         'Arizona': 'AZ',
         'Arkansas': 'AR',
         'California': 'CA',
         'Colorado': 'CO',
         'Connecticut': 'CT',
         'Delaware': 'DE',
         'Florida': 'FL',
         'Georgia': 'GA',
         'Hawaii': 'HI',
         'Idaho': 'ID',
         'Illinois': 'IL',
         'Indiana': 'IN',
         'Iowa': 'IA',
         'Kansas': 'KS',
         'Kentucky': 'KY',
         'Louisiana': 'LA',
         'Maine': 'ME',
         'Maryland': 'MD',
         'Massachusetts': 'MA',
         'Michigan': 'MI',
         'Minnesota': 'MN',
         'Mississippi': 'MS',
         'Missouri': 'MO',
         'Montana': 'MT',
         'Nebraska': 'NE',
         'Nevada': 'NV',
         'New Hampshire': 'NH',
         'New Jersey': 'NJ',
         'New Mexico': 'NM',
         'New York': 'NY',
         'North Carolina': 'NC',
         'North Dakota': 'ND',
         'Ohio': 'OH',
         'Oklahoma': 'OK',
         'Oregon': 'OR',
         'Pennsylvania': 'PA',
         'Rhode Island': 'RI',
         'South Carolina': 'SC',
         'South Dakota': 'SD',
         'Tennessee': 'TN',
         'Texas': 'TX',
         'Utah': 'UT',
         'Vermont': 'VT',
         'Virginia': 'VA',
         'Washington': 'WA',
         'West Virginia': 'WV',
         'Wisconsin': 'WI',
         'Wyoming': 'WY'}

In [63]:
# Clean Data

def clean_nan(x):
    """
    Cleans all columns in dataset that have missing data in the rows
    """
    
    if type(x) == str:
        if '-' in x:
            return 0
        elif '(X)' in x:
            return np.nan
        #otherwise...
    return x


def clean_names(x):
    """
    clears city names of city town village borough
    """
    
    if 'city' in x:
        x = x.replace('city', '')
    elif 'town' in x:
        x = x.replace('town', '')
    elif 'village' in x:
        x = x.replace('village', '') 
    elif 'borough' in x:
        x = x.replace('borough', '')
        
    x = x.replace(' ,',',') # fixes 'cityname , state' to 'cityname, state'
    return x

def state_abrev(name):
    """
    Replaces the state in the name column to abbreviation 
    """
    
    for x in states:
        if x in name:
            name = name.replace(x,states[x])
    return name

def wrangle(X):
    """
    This function will clean and restructure the dataset
    """
    X = X.drop(index = 0) # drop first row
    X['NAME'] = X['NAME'].apply(state_abrev)
    X['NAME'] = X['NAME'].apply(clean_names)
    for column_name in df.columns:
        X[column_name] = X[column_name].apply(clean_nan)
    
    return X

In [64]:
# Apply wrangle function to datasetW
clean_df = wrangle(df)

clead_df.head()

Unnamed: 0.1,Unnamed: 0,GEO_ID,NAME,DP02_0001E,DP02_0001M,DP02_0001PE,DP02_0001PM,DP02_0002E,DP02_0002M,DP02_0002PE,...,DP05_0087PE,DP05_0087PM,DP05_0088E,DP05_0088M,DP05_0088PE,DP05_0088PM,DP05_0089E,DP05_0089M,DP05_0089PE,DP05_0089PM
1,1,1600000US0167056,"Russellville, AL",3208,282,3208,,2195,213,68.4,...,5370,,2750,397,51.2,4.3,2620,260,48.8,4.3
2,2,1600000US0178984,"Vina, AL",146,37,146,,101,30,69.2,...,267,,136,43,50.9,6.8,131,44,49.1,6.8
3,3,1600000US0162688,"Providence, AL",101,29,101,,62,25,61.4,...,204,,108,43,52.9,8.9,96,30,47.1,8.9
4,4,1600000US0131096,"Grant, AL",367,58,367,,273,50,74.4,...,798,,356,81,44.6,4.7,442,106,55.4,4.7
5,5,1600000US0119816,"Daviston, AL",93,29,93,,77,22,82.8,...,203,,113,38,55.7,9.4,90,31,44.3,9.4


In [70]:
# Now we need to check to see if the top populated cities are within the dataset
df_pop_top = pd.read_csv('top_1000_cities_US.csv')

# Get all city names into a list without state
city_names = []
for name in clean_df['NAME']:
    temp = str(name).split(',') # Not sure why I am needing to cast to str type: for some reason name column is int type
    city_names.append(temp[0])
    
# Now we are able to check if the top 1000 cities are within our data
city_not_in_data = []
for name in df_pop_top['City Name']:
    if name not in city_names:
        city_not_in_data.append(name)
        
print('The amount of cities not found:', len(city_not_in_data))
print('Phoenix' in city_names)
print('Phoenix city' in city_names)

The amount of cities not found: 27
True
False
