# Data: hotcars.csv

 HotCar reporting data. Each hot car report is from a tweet mentioning a single valid 4 digit Metro car number.

 - *car_number*: Metro car number
 - *color*: Line color
 - *time*: Tweet time (UTC)
 - *text*: Tweet text
 - *handle*: Twitter user's screen name
 - *user_id*: Twitter user's user_id
 - *tweet_id*



In [124]:
import pandas as pd

In [125]:
data = pd.read_csv('data/hotcars.csv')

In [126]:
data.head()

Unnamed: 0,car_number,color,time,text,handle,user_id,tweet_id
0,1001,RED,2013-05-28T12:39:54+00:00,"Was just on metro hot car #1001 (red line), an...",CarChickMWB,18249348,339360382302949376
1,1188,RED,2013-05-28T12:50:53+00:00,@FixWMATA @unsuckdcmetro @wmata RL #HotCar 118...,DiavoJinx,403520304,339363145795653632
2,1068,GREEN,2013-05-28T21:06:20+00:00,"oh good, another hot car on the metro. green l...",lexilooo,16174883,339487827479908353
3,2066,ORANGE,2013-05-28T21:15:23+00:00,#HotCar 2066 on OL to New Carrollton. Air is ...,TheHornGuy,40506740,339490107864276992
4,1043,BLUE,2013-05-28T22:08:07+00:00,Car 1043 on the blue line heading to Largo is ...,jessydumpling,263763147,339503377618718720


# Cleaning the Data
1. Rename 'time' column to 'time_stamp'
    - break down the 'time_stamp' into columns ('time', 'full_date', 'year', 'month', 'date')
   
   
2. Delete 'tweet_id' : it doesn't provide any useful info

3. Add news columns 'weekday', 'season', 'weekday' and use 'time_stamp' to generate these values
    - 'workday'
        - 0 = not a workday
        - 1 = a workday
    - 'season' 
        - Winter = 1
        - Spring = 2
        - Summer = 3
        - Autumn = 4
    - 'weekday' 
        - 0 = Monday
        - 1 = Tuesday
        - 2 = Wednesday
        - 3 = Thursday
        - 4 = Friday
        - 5 = Satuday
        - 6 = Sunday

In [127]:
# drops the 'tweet_id' column
data = data.drop(columns = {'tweet_id'})

In [128]:
# renaming 'time' into 'time_stamp'
data = data.rename(columns = {'time' : 'time_stamp'})

In [129]:
# import datetime librarby to convert objects to datetimes
from datetime import datetime

# changes 'time_stamp' to datetime
data['time_stamp']= pd.to_datetime(data['time_stamp']) 

In [130]:
# creating new columns 
data['full_date'] = ''
data['year'] = ''
data['month'] = ''
data['day'] = ''
data['season'] = ''

data['weekday'] = ''  # this will be what day it is (Mon-Sun)
data['season'] = ''   # this will be what season (winter = 1, spring = 2, summer = 3, fall = 4)
data['workday'] = ''  # if it's a workday or not (0 = not a workday, 1 = a workday)

In [160]:
# fill in a 'full_date' based on 'time_stamp'
data.full_date = data.time_stamp.dt.date
# turning 'full_date' into a datetime
data['full_date'] = pd.to_datetime(data['full_date'])

# fills in 'weekday' based on 'time_stamp' 
data.weekday = data.time_stamp.dt.dayofweek

# fill in 'year' column based on 'time_stamp'
data.year = data.time_stamp.dt.year

# fill in 'month' column based on 'time_stamp'
data.month = data.time_stamp.dt.month

# fill in 'season' column based on 'time_stamp'
data.season = data.time_stamp.dt.quarter

# fill in 'day' column based on 'time_stamp'
data.day = data.time_stamp.dt.day

In [132]:
import numpy as np

# fills in the 'workday' column, based on the weekday column
data['workday'] = np.where(data['weekday']>=4, 1 , 0 )

# Final Clean Data

 - *car_number*: Metro car number
 - *color*: Line color
 - *time_stamp*: Tweet time (UTC)
 - *text*: Tweet text
 - *handle*: Twitter user's screen name
 - *user_id*: Twitter user's user_id
 - *full_date* : yyyy/mm/dd
 - *year* : year
 - *month* : month of year
 - *day* : day of month
 - *season* : what season it is
 - *weekday* : Monday - Sunday 
 - *workday* : If it's a workday or not

In [133]:
 data.head()

Unnamed: 0,car_number,color,time_stamp,text,handle,user_id,full_date,year,month,day,season,weekday,workday
0,1001,RED,2013-05-28 12:39:54+00:00,"Was just on metro hot car #1001 (red line), an...",CarChickMWB,18249348,2013-05-28,2013,5,28,2,1,0
1,1188,RED,2013-05-28 12:50:53+00:00,@FixWMATA @unsuckdcmetro @wmata RL #HotCar 118...,DiavoJinx,403520304,2013-05-28,2013,5,28,2,1,0
2,1068,GREEN,2013-05-28 21:06:20+00:00,"oh good, another hot car on the metro. green l...",lexilooo,16174883,2013-05-28,2013,5,28,2,1,0
3,2066,ORANGE,2013-05-28 21:15:23+00:00,#HotCar 2066 on OL to New Carrollton. Air is ...,TheHornGuy,40506740,2013-05-28,2013,5,28,2,1,0
4,1043,BLUE,2013-05-28 22:08:07+00:00,Car 1043 on the blue line heading to Largo is ...,jessydumpling,263763147,2013-05-28,2013,5,28,2,1,0


### Export our clean df to a new csv file for later use

In [209]:
# creating an output file path
outputFile=os.path.join('C:\\Users\\606569\\Documents\\my_GitHub\\D.C-metro-analysis\\data\\cleaned\\hotcars_clean.csv')

#exporting our clean data to a new csv
data.to_csv(outputFile)