In [1]:
import datetime
import tarfile
import json
import bz2
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
from pyspark.sql.types import *
from pyspark.sql.functions import udf, desc, min, max, to_date, date_format, expr, hour, year, month, dayofweek, count
from pyspark.sql import functions as F #module that includes a variety of functions like to extract features

In [3]:
sc

### Loading csv data file to pyspark

In [7]:
# defining schema per column, header and data type
schema = StructType([
    StructField("index",IntegerType(), nullable=True),
    StructField("id", IntegerType(), nullable=True),
    StructField("date", DateType(), nullable=True),
    StructField("flag", StringType(), nullable=True),
    StructField("user", StringType(), nullable=True),
    StructField("text", StringType(), nullable=True)
])

#reading in the file with the schema defined above
tweetsDf= spark.read.csv('hdfs://localhost:9000/user1/ProjectTweets.csv', schema=schema, header=False).withColumn("date", to_date("date"))

#printing schema and head of the table
tweetsDf.printSchema();tweetsDf.show()

root
 |-- index: integer (nullable = true)
 |-- id: integer (nullable = true)
 |-- date: date (nullable = true)
 |-- flag: string (nullable = true)
 |-- user: string (nullable = true)
 |-- text: string (nullable = true)

+-----+----------+----+--------+---------------+--------------------+
|index|        id|date|    flag|           user|                text|
+-----+----------+----+--------+---------------+--------------------+
|    0|1467810369|null|NO_QUERY|_TheSpecialOne_|@switchfoot http:...|
|    1|1467810672|null|NO_QUERY|  scotthamilton|is upset that he ...|
|    2|1467810917|null|NO_QUERY|       mattycus|@Kenichan I dived...|
|    3|1467811184|null|NO_QUERY|        ElleCTF|my whole body fee...|
|    4|1467811193|null|NO_QUERY|         Karoli|@nationwideclass ...|
|    5|1467811372|null|NO_QUERY|       joy_wolf|@Kwesidei not the...|
|    6|1467811592|null|NO_QUERY|        mybirch|         Need a hug |
|    7|1467811594|null|NO_QUERY|           coZZ|@LOLTrish hey  lo...|
|    8|14

In [6]:
pd.read_csv('ProjectTweets.csv')

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,1,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,2,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,3,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,4,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,5,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew
...,...,...,...,...,...,...
1599994,1599995,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best fee...
1599995,1599996,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...
1599996,1599997,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me f...
1599997,1599998,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...
