# Data Encoding

In [206]:
# Packages for loading, cleaning, visualization, and analysis

# Data
import pandas as pd
import numpy as np
import scipy as sp
import os
import string as st




### In Class Exercise  -  Save the Cleaned Data Frame

Copy the cells from your Data Engineering 3 - Data Cleaning notebook to this notebook. Only copy the cells that you use to clean the data (i.e., if you use imputation rather than row deletion copy the imputation cells and not the row deletion cells). 

Run the code in the cells and then save your results, the cleaned data frame, to your local or cloud storage. 

Read the data to validate that you correctly saved your data.

In [207]:
# I have my own function that will pull each year of the train data in a panda df
# You pass a directory and it returns a list of pandas dataframes
def getListDataFrames(path):
    # initialize an empty list
    csvs = list()
    # Get all the files in the directory
    all_files = os.listdir(path)
    # Iterate across each of the files in the Data folder
    for fileName in all_files:
        # Get the full path name
        fileToPull = path + fileName
        # Read in the data at that location
        filesOfInterest = pd.read_csv(fileToPull, low_memory = False, encoding='ISO-8859-1')
        csvs.append(filesOfInterest)
    return csvs

# Here is the list of 16 pandas dataframes corresponding to 2001 thru 2016
listOfAccidents = getListDataFrames('/Users/mead/Fall2017/DonBrown-DS6001/InClass1/Data/')

#### Concatenate the pandas together

In [208]:
acc_df = pd.concat(listOfAccidents, ignore_index = True)
acc_df.shape

(51623, 154)

#### Combine the Narratives

In [209]:
# # 1
# # Produce a function to add all of the Narratives together
# # Takes the index of the NARR1 column as input alongside the dataframe of interest. 
# # It outputs the full Narrative column
# def combineNARR(NARR1index, dataFrme):
# # Initialize an empty list of all narratives
#     allAccs = list()
#     # Iterate across every row
#     for acc in range(len(dataFrme.index)):
#         # Initialize an empty list of each row's narrative
#         fullNarr = list()
#         # Iterate across every NARR column
#         for narr in range(15):
#             index = narr + NARR1index[0]
#             # Keep track of the value in that NARR column for that row
#             narrElement = str(dataFrme.iloc[acc, index])
#             fullNarr.append(narrElement)
#         allAccs.append(fullNarr)
#     return allAccs

# # This is the which function which keeps track of where NARR1 is
# which = lambda lst:list(np.where(lst)[0])
# NARR1index = which([name == 'NARR1' for name in acc_df.columns])

# # Run the function -- output is a list of lists
# newColumn = combineNARR(NARR1index, acc_df)

# # 2
# # Combine all the NARR together for each row. Make sure to remove the NAs
# Narratives = [''.join([sentence for sentence in NARR if sentence != 'nan']) for NARR in newColumn]
# # Add this as a new column for the accidents df
# acc_df['Narrative'] = pd.Series(Narratives, index=acc_df.index)

In [210]:
# Join the narrative and put them in a list

def join_narratives(DF):
    '''With the input of the accident dataframe
    merge the narrative columns into a single narrative
    and return a list of these single narratives for each
    accident report in the dataframe. '''
    narrlist = []
    for i in range(0,15):
        a = str(i+1)
        narrlist.append('NARR'+ a)
    RailNarr = DF.loc[:, narrlist]
    Narratives = []
    for i, _ in enumerate(RailNarr["NARR1"]):
#         print(i)
        NarrativeList = RailNarr.iloc[i]
        Anarrative = ""
        for narr in NarrativeList:
            if pd.isnull(narr):
                break
            else:
                if(i == 4320):
                    print(narr)
                Anarrative += str(narr.encode('ascii', 'ignore').decode('ascii'))
        Narratives.append(Anarrative)
    return Narratives

newColumn = join_narratives(acc_df)

# Add this as a new column for the accidents df
acc_df['Narrative'] = newColumn

CP TRAIN 292-16 (86 LOADS, 26 EMPTIES, 12,342 EGT) INCLUDING 15 LOADS ANHYDROUS AMMONIA; 10 LOADS LP
G AND 11 LOADS STYRENE MONOMER INHIBITED, DERAILED 31 CARS, ALL ON THEIR SIDES, RESULTING IN HAZMAT
RELEASE FROM ELEVEN CARS (SEE APPENDED LIST) RESULTING IN SPILL OF APPROXIMATELY 590 TONS.  EVACUATI
ON OF APPROXIMATELY 20 HOMES IN A HALF-MILE RADIUS IMMEDIATELY EAST OF THE DERAILMENT SITE AND REMAI
NS IN EFFECT AS OF 2/27/02.  ONE FATALITY TO A MINOT, ND RESIDENT RESULTED DURING VOLUNTARY EVACUATI
ON.  APPROXIMATELY 1,000-2,000 PEOPLE WERE SEEN AND EVALUATED AT HOSPITALS AND CLINICS WITH SOME BEI
NG TREATED AND/OR ADMITTED INTO THE HOSPITAL.   EMERGENCY RESPONSE DISPATCHED, COMPANY PERSONNEL AS
WELL AS NTSB AND FRA PERSONNEL AT SITE.  STATIONARY COMMAND CENTER SET UP AT LOCAL FIRE STATION AND
A MOBILE COMMAND CENTER NEAREST DERAILMENT SITE HAS BEEN SET UP.  FIRE DEPARTMENT RESPONDED AND HOSE
D DOWN HAZMAT CARS.  SPAN OF DERAILMENT SITE TOTALED 475 FEET.  SINGLE MAIN CWR TRACK, CLASS 3

#### Drop old NARR columns

In [211]:
# Drop old narrative columns
narrList = []
for i in range(0,15):
    a = str(i + 1)
    narrList.append('NARR' + a)
acc_df.drop(narrList, axis = 1, inplace = True)

#### Removing the duplicates

In [212]:
# Removing the duplicates
acc_clean_df = acc_df[(acc_df['JOINTCD'] == 1) & (acc_df['TYPE'] != 7)]

In [213]:
acc_df.shape

(51623, 140)

In [214]:
acc_clean_df.shape

(38167, 140)

In [215]:
acc_clean_df = acc_clean_df.dropna(axis = 1, thresh = (38167-1500))

In [216]:
acc_clean_df.shape

(38167, 88)

In [217]:
# This class imputes the missing values as
# (1) most frequent if the variable is categorical 
# (2) mean if the variable is real (floating point)
# (3) median if the variable is an integer 

# Here is a class that will provide imputation
# This is an extension by D.Brown to sveitser, 2014 https://stackoverflow.com/users/469992/sveitser

from sklearn.base import TransformerMixin

class DataFrameImputer(TransformerMixin):

    def __init__(self):
        """Impute missing values.

        Columns of dtype object are imputed with the most frequent value 
        in column.
        
        Columns of dtype floating point are imputed with the mean.

        Columns of other types are imputed with median of the column.

        """
    def fit(self, X, y=None):

        self.fill = pd.Series([X[c].value_counts().index[0]
            if X[c].dtype == np.dtype('O') 
                               else X[c].mean() if X[c].dtype == np.dtype('f')
                                else X[c].median() for c in X],
            index=X.columns)

        return self

    def transform(self, X, y=None):
        return X.fillna(self.fill)


In [218]:
acc_clean_df = DataFrameImputer().fit_transform(acc_clean_df)

In [219]:
sum(acc_clean_df.isnull().sum())

0

In [220]:
# REPLACING CATEGORICAL VARIABLES
# Now to actually go in and make replacements for all of these values 
# you can go in and use .replace()
acc_clean_df["TYPE"] = acc_clean_df['TYPE'].replace(range(1,14),['Derailment', 
    'Head on collision', 'Rearend collision', 'Side collision', 'Raking collision', 
    'Broken train collision', 'Hwy-rail crossing', 'RR Grade Crossing', 'Obstruction', 
    'Explosive-detonation', 'fire/violent rupture', 'Other impacts', 
    'Other (described in narr)'])
# Or you can go in and use a dictionary
#map_typeq = {1:'Freight', '1':'Freight'}
#acc_clean_df['TYPEQ'] = acc_clean_df.map(map_typeq)
print("Types: \n" + str(acc_clean_df["TYPE"].value_counts()))

# Doing the same thing  but for weather
acc_clean_df["WEATHER"] = acc_clean_df['WEATHER'].replace(range(1,7),['clear', 
    'cloudy', 'rain', 'fog', 'sleet', 'snow'])
print("\nWeather: \n" + str(acc_clean_df["WEATHER"].value_counts()))

Types: 
Derailment                   27429
Other impacts                 4321
Other (described in narr)     2127
Side collision                1603
Obstruction                    991
Raking collision               720
fire/violent rupture           454
Rearend collision              298
Head on collision              125
Broken train collision          80
Explosive-detonation            14
RR Grade Crossing                5
Name: TYPE, dtype: int64

Weather: 
clear     25307
cloudy     8757
rain       2738
snow        856
fog         431
sleet        78
Name: WEATHER, dtype: int64


In [221]:
# Can't use CSV because of commas in the Narrative

In [222]:
print(acc_clean_df.index)

Int64Index([    0,     1,     2,     3,     4,     5,     7,     8,     9,
               10,
            ...
            51611, 51612, 51613, 51614, 51616, 51617, 51618, 51619, 51621,
            51622],
           dtype='int64', length=38167)


In [224]:
# Now save the Narrative
# First create dictionary of Narratives
# Note: JSON doesn't allow integer values for indices

str_index = [str(x) for x in acc_clean_df.index]
Narrative_dict = dict(zip(str_index, acc_clean_df.Narrative))

In [225]:
len(Narrative_dict)

38167

In [226]:
acc_clean_df.index[878]

1196

In [227]:
Narrative_dict['1196']

'CREW MEMBER FAILED TO LINE SWITCH OF THE CROSSOVER, TRAILED THROUGH THE SWITCH RESULTING IN A DERAILMENT WHEN MOVEMENT WAS REVERSED.'

In [228]:
import json
path = '/Users/mead/Fall2017/DonBrown-DS6001/InClass1/'
file = 'TrainNarratives.txt'
with open(path+file, 'w') as destination:
    json.dump(Narrative_dict, destination)

In [202]:
acc_clean_df.shape

(38167, 87)

In [204]:
acc_clean_df.to_csv('/Users/mead/Fall2017/DonBrown-DS6001/InClass1/FullCleanedAdrian.csv')