In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

In [None]:
# Load the raw UFO sightings CSV.
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0;
# `on_bad_lines="warn"` keeps the same behaviour: malformed rows are skipped
# and reported instead of aborting the load.
df = pd.read_csv("data/ufo.csv", on_bad_lines="warn")

In [None]:
# Work on an independent copy so the loaded frame `df` stays untouched
ufo_data = df.copy(deep=True)

# Peek The Data

In [None]:
# First five rows of the working copy
ufo_data.head(n=5)

In [None]:
# (rows, columns) of the working copy before any cleaning
ufo_data.shape

In [None]:
# Column labels as read from the CSV header
ufo_data.columns

In [None]:
# Per-column dtypes as inferred when the CSV was read
ufo_data.dtypes

# Data Wrangling & Tidying Up

In [None]:
# First five rows again, as the starting point for the cleaning steps
ufo_data.iloc[:5]

In [None]:
# Dtypes before cleaning (the frame has not been modified since the earlier check)
ufo_data.dtypes

In [None]:
# Null values per column, as a percentage of all rows (largest first)
nulvals = ufo_data.isna().sum()
nulpct = nulvals.div(len(ufo_data)).mul(100)
print(nulpct.sort_values(ascending=False).round(2))

# Cleaning plan for the columns that contain nulls:
#   country              -> fill with "unknown"
#   state                -> fill with "unknown"
#   duration (hours/min) -> drop the column
#   duration (seconds)   -> drop rows with nulls
#   shape                -> drop rows with nulls
#   comments             -> drop rows with nulls

In [None]:
# Absolute number of nulls per column
ufo_data.isna().sum()

In [None]:
# country -> "unknown": print the distinct values before and after filling the gaps
print(ufo_data["country"].unique())
ufo_data.loc[ufo_data["country"].isna(), "country"] = "unknown"
print(ufo_data["country"].unique())

In [None]:
# state -> "unknown": print the distinct values before and after filling the gaps
print(ufo_data["state"].unique())
ufo_data["state"] = ufo_data["state"].fillna(value="unknown")
print(ufo_data["state"].unique())


In [None]:
# duration (hours/min) -> drop column
# `errors='ignore'` keeps the cell idempotent: re-running it after the column
# is already gone is a no-op instead of the KeyError that `del` would raise.
ufo_data = ufo_data.drop(columns=['duration (hours/min)'], errors='ignore')

In [None]:
# Drop every row that has a null in any of:
#   duration (seconds), shape, comments
ufo_data = ufo_data.dropna(
    axis=0,
    how='any',
    subset=['duration (seconds)', 'shape', 'comments'],
)

In [None]:
# Null counts per column after the fill/drop steps
ufo_data.isna().sum(axis=0)

In [None]:
# (rows, columns) remaining after the null handling
ufo_data.shape

In [None]:
#transform column

In [None]:
# First five rows before the column transformations
ufo_data.head(5)

In [None]:
# Show the current dtypes; the comments below list the planned column transformations
ufo_data.dtypes
# datetime    -> convert to a datetime column
# year        -> add new column
# month       -> add new column
# monthday    -> add new column
# weekday     -> add new column
# year_posted -> add new column
# date posted -> drop column

In [None]:
# datetime -> datetime
#
# The raw strings write midnight as "24:00", an hour value pd.to_datetime rejects.
# Rewriting it to "00:00" alone would place the sighting at the START of the same
# calendar day (24 hours too early), so rows that carried "24:00" are shifted
# forward by one day after parsing.
# The dtype guard makes the cell safe to re-run once the column is already parsed.
if not pd.api.types.is_datetime64_any_dtype(ufo_data['datetime']):
    had_24h = ufo_data['datetime'].str.contains('24:00', regex=False, na=False)
    parsed = pd.to_datetime(ufo_data['datetime'].str.replace('24:00', '00:00', regex=False))
    ufo_data['datetime'] = parsed + pd.to_timedelta(had_24h.astype(int), unit='D')
ufo_data.dtypes

In [None]:
# Derive calendar fields from the parsed `datetime` column:
#   year, month, monthday (day of month), weekday (value of .dt.weekday)
ufo_data = ufo_data.assign(
    year=ufo_data['datetime'].dt.year,
    month=ufo_data['datetime'].dt.month,
    monthday=ufo_data['datetime'].dt.day,
    weekday=ufo_data['datetime'].dt.weekday,
)
ufo_data.head()

In [None]:
#year_posted -> add new
#date posted -> drop

def grep_year(x):
    """Return the year of a "month/day/year[ time]" string as an int.

    Only the text before the first space is considered; the year is the
    third "/"-separated field of that part.
    """
    date_part = x.partition(" ")[0]
    return int(date_part.split("/")[2])

# Keep only the posting year, then discard the original 'date posted' strings
ufo_data['year_posted'] = ufo_data['date posted'].map(grep_year)
del ufo_data['date posted']
ufo_data.head()

In [None]:
# Adding Season Column
def conv_season(x):
    """Map a month number to a season name.

    3-5 -> "Spring", 6-8 -> "Summer", 9-11 -> "Autumn", 12/1/2 -> "Winter".
    Any other value returns None.
    """
    seasons = (
        ("Spring", (3, 4, 5)),
        ("Summer", (6, 7, 8)),
        ("Autumn", (9, 10, 11)),
        ("Winter", (12, 1, 2)),
    )
    for name, months in seasons:
        if x in months:
            return name
    return None

# Label every sighting with the season of its month
ufo_data["season"] = ufo_data['month'].apply(conv_season)
# Fixed random_state so the preview shows the same 15 rows on every run
ufo_data.sample(15, random_state=0)

In [None]:
# Latitude, longitude and duration as numerics; values that cannot be
# parsed become NaN (errors='coerce')
ufo_data = ufo_data.assign(**{
    column: pd.to_numeric(ufo_data[column], errors='coerce')
    for column in ('latitude', 'longitude', 'duration (seconds)')
})

In [None]:
# Null counts per column after the numeric coercion
ufo_data.isna().sum()

In [None]:
# Keep only rows where duration, latitude and longitude are all present
ufo_data = ufo_data[
    ufo_data[['duration (seconds)', 'latitude', 'longitude']].notna().all(axis=1)
]

# Data Exploration

In [None]:
# Histogram overview: DataFrame.hist draws one panel per plottable column
ufo_data.hist(figsize=(13, 11))
plt.show()

In [None]:
# Correlation coefficient matrix
#
# Only numeric columns are passed to corr(): the frame still holds string
# columns (city, state, shape, comments, ...), and recent pandas versions raise
# on them instead of silently skipping them.
plt.figure(figsize=(15,10))
sns.heatmap(ufo_data.select_dtypes(include='number').corr(),
            annot=True, linewidths=.20, fmt='.3f')
plt.title('Correlation between different features')
plt.show()

In [None]:
# Distribution of sightings over year, month and day of month, one panel each
# NOTE(review): sns.distplot is deprecated in newer seaborn releases; consider histplot/displot
fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(20,5))

for panel, column in zip(ax, ('year', 'month', 'monthday')):
    sns.distplot(ufo_data[column], ax=panel)

In [None]:
# Earliest and latest sighting year in the cleaned data
print(ufo_data['year'].min(), ufo_data['year'].max())

In [None]:
# Ufo Sighting Frequency (1966-2014)

## Set axes ##
years_data = ufo_data.year.value_counts()  # sightings per year, most frequent first
years_index = years_data.index  # x ticks
years_values = years_data.values  # Series.get_values() was removed in pandas 1.0

## Create Bar Plot ##
plt.figure(figsize=(15,8))
plt.xticks(rotation = 60)
plt.title('UFO Sightings by Year')

# Only the 40 years with the most sightings are drawn
years_plot = sns.barplot(x=years_index[:40],y=years_values[:40], palette = "GnBu")

In [None]:
# Where do most UFO sightings occur? Sightings per country, largest first
ufo_data['country'].value_counts()

In [None]:
# Where most UFO sightings occur?
country_sightings = ufo_data.country.value_counts()

# Legend text is looked up per country code so it always matches the wedge
# order produced by value_counts(), rather than assuming a fixed ranking.
# NOTE(review): the code -> name pairs are assumed from the dataset's country
# column; unknown codes fall back to the raw code. Verify with
# ufo_data.country.unique().
country_names = {'us': 'United States', 'unknown': 'Unknown', 'ca': 'Canada',
                 'gb': 'United Kingdom', 'au': 'Australia', 'de': 'Germany'}
legend_labels = [country_names.get(code, code) for code in country_sightings.index]

# matplotlib needs exactly one explode entry per wedge, so size it from the
# data; only the last (smallest) wedge is pulled out.
explode = [0.0] * len(country_sightings)
if explode:
    explode[-1] = 0.05
explode = tuple(explode)

# Six colours for six wedges, so the last wedge does not reuse the colour of the first
colors = ['lightblue','gold','yellowgreen','lightcoral','orange','plum']
# NOTE(review): fontsize=0 looks intended to hide the wedge labels in favour of
# the legend; confirm it does not also hide the autopct percentages.
country_sightings.plot(kind = 'pie', fontsize = 0, title='UFO Sightings by Country', colors=colors,
                       autopct='%1.1f%%',shadow=True, explode=explode,figsize=(8,8))
plt.legend(labels=legend_labels, loc="best")
plt.tight_layout()

Could mean:
1. A large number of Americans are part reptilian alien
2. The US government is in cahoots with extra-terrestrials and is using their technology to maintain their status as a global superpower

In [None]:
# UFO Sightings by US State

In [None]:
# Share of all sightings per country (fractions rather than counts)
ufo_data.country.value_counts(normalize=True)

In [None]:
# Where do UFO Sightings occur most within the United States?
usa_filter = ufo_data['country']=='us'
us_data = ufo_data[usa_filter]

states_sights = us_data.state.value_counts()  # sightings per state, largest first
state_names = states_sights.index  # x axis ticks
state_freq = states_sights.values  # y axis values (get_values() was removed in pandas 1.0)

plt.figure(figsize=(15,8))
plt.xticks(rotation = 60)
plt.title('Total UFO Sightings by State')
states_plot = sns.barplot(x=state_names,y=state_freq, palette="GnBu_r")
plt.show()
print('Top 10 States for Total UFO Sightings:')
print(states_sights[:10].sort_values(ascending=False))

In [None]:
# Measure UFO sightings relative to state population
# Population per lower-case two-letter code; besides the states the table also
# has entries for 'dc' and 'pr'.
# NOTE(review): source and reference year of these figures are not recorded here — TODO confirm
statespop = {'al':4872725.,'ak':746079.,'az':7044577.,'ar':2998643.,'ca':39506094.,
            'co':5632271.,'ct':3568174.,'de':960054.,'dc':691963.,'fl':20979964.,
            'ga':10421344.,'hi':1431957.,'id':1713452.,'il':12764031.,'in':6653338.,
            'ia':3147389.,'ks':2907857.,'ky':4449337.,'la':4694372.,'me':1333505.,
            'md':6037911.,'ma':6839318.,'mi':9938885.,'mn':5557469.,'ms':2988062.,
            'mo':6109796.,'mt':1052967.,'ne':1920467.,'nv':2996358,'nh':1339479.,
            'nj':8953517.,'nm':2081702.,'ny':19743395.,'nc':10258390.,'nd':759069.,
            'oh':11623656.,'ok':3939708.,'or':4162296.,'pa':12776550.,'pr':3661538.,
            'ri':1057245.,'sc':5027404.,'sd':872989.,'tn':6707332.,'tx':28295553.,
            'ut':3111802.,'vt':623100.,'va':8456029.,'wa':7415710.,'wv':1821151.,
            'wi':5789525.,'wy':584447.} 
states_pop = pd.Series(statespop)  # Series indexed by state code, so it aligns with per-state counts


# Sightings per resident, times 100 (i.e. a percentage). The division aligns
# on the state code; codes missing from either series become NaN.
state_propsight = (states_sights / states_pop)*100
state_propsight = state_propsight.sort_values(ascending=False)

# Visualize it
us_namesp = state_propsight.index  # x ticks
us_sightsp = state_propsight.values  # y values (get_values() was removed in pandas 1.0)

plt.figure(figsize=(15,8))
plt.xticks(rotation=60)
plt.title('State UFO Sightings Relative to Population')
# Only the 50 highest ratios are drawn
sns.barplot(x = us_namesp[:50], y = us_sightsp[:50], palette="GnBu_r")
plt.show()
print('States with Highest Proportion of UFO Sightings:')
print(round(state_propsight[:10],2))

In [None]:
#Is there a time during the year when I'm most likely to see a UFO?
m_cts = (ufo_data['month'].value_counts()).sort_index()  # sightings per month, January..December
m_ctsx = m_cts.index
m_ctsy = m_cts.values  # Series.get_values() was removed in pandas 1.0
f, ax = plt.subplots(figsize=(15,8))

sns.barplot(x=m_ctsx, y=m_ctsy, palette="YlGnBu")
ax.set_title('Global UFO Sightings by Month')
ax.set_xlabel('Month')
ax.set_ylabel('# Sightings')
plt.xticks(rotation=45)
plt.show()

In [None]:
#Which season was sighted
# `kind` must be passed by keyword: pandas >= 1.0 rejects positional
# arguments to Series.plot with a TypeError.
ufo_data['season'].value_counts().plot(kind='bar')
plt.ylabel("Frequency")
plt.xticks(rotation=0)
plt.title("Sight - Season")
plt.show()

Typically, I see UFOs in the summer. This may be because I'm outside more often in the summer, or it could be because UFO activity is heightened during the summer. We can group UFO sightings by location in order to see if this makes a difference (above the equator vs. below the equator).

In [None]:
#How long
# Bucket the sighting DURATIONS. The earlier loop iterated over
# value_counts(), i.e. over how often each duration occurs, so it was binning
# frequencies rather than durations. It also counted 15, 60 and 120 twice
# (overlapping ranges) and ignored non-integer durations of 240 seconds or less.
plt.subplots(figsize=(22,8))
duration_sec = ufo_data["duration (seconds)"]
# Bin edges are right-inclusive: [0, 15], (15, 30], (30, 60], (60, 120], (120, 240], (240, inf)
duration_sec_list = pd.cut(
    duration_sec,
    bins=[0, 15, 30, 60, 120, 240, np.inf],
    labels=["0-15", "15-30", "30-60", "60-120", "120-240", ">240"],
    include_lowest=True,
)
# sort=False keeps the buckets in ascending duration order on the x axis
duration_counts = duration_sec_list.value_counts(sort=False)
di = duration_counts.index.astype(str)
dv = duration_counts.values
# x/y passed by keyword: recent seaborn versions no longer accept them positionally
sns.barplot(x=di, y=dv)

plt.xlabel("Time - Seconds",fontsize=24)
plt.xticks(fontsize=20)
plt.ylabel("Rates",fontsize=24)
plt.yticks(fontsize=20)

plt.show()

In [None]:
#Shape
# Number of sightings per reported shape, most common first
plt.subplots(figsize=(18,8))

# `kind` must be passed by keyword: pandas >= 1.0 rejects positional
# arguments to Series.plot with a TypeError.
ufo_data['shape'].value_counts().plot(kind='bar')
plt.xticks(rotation=45, fontsize=15)
plt.show()

In [None]:
#Sight years and post years
# One figure with two stacked panels. The previous version called
# plt.subplots() a second time, which opened a new figure and split the two
# charts apart, and it labelled the post-year axis with ylabel instead of xlabel.
fig, (ax_sight, ax_post) = plt.subplots(2, 1, figsize=(22,10))

# `kind` is passed by keyword: pandas >= 1.0 rejects positional Series.plot arguments
ufo_data['year'].value_counts().plot(kind='bar', ax=ax_sight)
ax_sight.set_title("Sight rates by years")
ax_sight.set_xlabel("Years")

ufo_data['year_posted'].value_counts().plot(kind='bar', ax=ax_post, rot=0)
ax_post.set_title("Posting the case's rates by years")
ax_post.set_xlabel("Post Year")

fig.tight_layout()  # keep the upper panel's x label clear of the lower title
plt.show()

In [None]:
# Join every non-null comment into one string and render it as a word cloud
words = " ".join(ufo_data['comments'].dropna())

plt.subplots(figsize=(28,12))
wordcloud = WordCloud(background_color='white', width=2048, height=1024).generate(words)
plt.imshow(wordcloud)
plt.axis('off')
# Save before show(): the image is written to graph.png in the working directory
plt.savefig('graph.png')

plt.show()