# UFO Data Cleaning

In [None]:
#import file as csv
import os
import pandas as pd

# Read the raw sightings export into a DataFrame.
file_path = "Resources/ufo_sightings_scrubbed.csv"
ufo_file = pd.read_csv(filepath_or_buffer=file_path)

# Peek at the first rows (not rendered: only a cell's final expression is shown).
ufo_file.head(n=5)

# Number of sightings recorded per country code.
ufo_file["country"].value_counts()

In [None]:
#only look at US for country

# Discard every row that has a missing value in any column.
complete_rows = ufo_file.dropna(axis=0, how="any")

# Keep only the rows whose country code is 'us'.
is_us = complete_rows["country"] == "us"
ufo_file = complete_rows.loc[is_us, :]

# Show the remaining country codes and their counts.
ufo_file["country"].value_counts()


In [None]:
# ufo_file.head()

In [None]:
# Persist the cleaned US sightings for the later sections of the notebook.
# index=False keeps the row index out of the file, so re-reading it does not
# produce a spurious "Unnamed: 0" column; the column order is otherwise unchanged.
ufo_file.to_csv("Resources/ufo_clean.csv", index=False)

# Military Base Data Cleaning

In [None]:
import os
import pandas as pd

# Read the military base listing into a DataFrame.
file_path = "Resources/Military_Bases.csv"
military_file = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Tally how many rows fall under each value of the COMPONENT column.
military_file['COMPONENT'].value_counts()

In [None]:
# Keep only the bases whose COUNTRY value is exactly 'United States'.
in_united_states = military_file['COUNTRY'] == 'United States'
military_file = military_file.loc[in_united_states, :]

In [None]:
# Tally the rows per COUNTRY value to check the result of the filter.
military_file['COUNTRY'].value_counts()

In [None]:
# Write the filtered bases to disk. With pandas' default index=True the row
# index is written as the first, unnamed column of the CSV.
military_file.to_csv('Resources/Military_Clean.csv')

# Duration v. State Bar Charts

In [None]:
%matplotlib notebook

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
import numpy as np

In [None]:
# Load the cleaned sightings file produced in the data-cleaning section.
file_path = "Resources/ufo_clean.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Normalise the duration header, then keep only the two columns the charts need.
# NOTE(review): DataFrame.rename ignores keys that are not present, so this is a
# no-op unless the CSV really has a "duration/n(seconds)" header — confirm.
column_renames = {"duration/n(seconds)": "duration (seconds)"}
chart_columns = ["state", "duration (seconds)"]

ufo_df = df.rename(columns=column_renames)[chart_columns]

In [None]:
# Median sighting duration per state, as a flat frame with one row per state.
ufo_state = (
    ufo_df.groupby("state")["duration (seconds)"]
    .median()
    .reset_index()
    .rename(columns={"duration (seconds)": "median duration (seconds)"})
)

# Same median expressed in minutes (the unit used on the bar charts).
ufo_state["median duration (mins)"] = ufo_state["median duration (seconds)"] / 60

In [None]:
# Ten states with the largest median duration, with the state codes upper-cased.
top_states = ufo_state.nlargest(n=10, columns="median duration (seconds)").assign(
    state=lambda frame: frame["state"].str.upper()
)

In [None]:
# Horizontal bar chart of the ten states with the highest median sighting duration.
duration = top_states["median duration (mins)"]
states = top_states["state"]

plt.figure()
plt.barh(states, duration, color="chartreuse", edgecolor="black")
plt.yticks(states, states)
plt.xticks([0, 1, 2, 3, 4, 5, 6, 7])
plt.xlabel("Median Duration of Sighting (Minutes)")
plt.ylabel("State Abbrev.")
# The bars are per-state medians, so the title says "Median" to agree with the x-axis label.
plt.title("States with the Longest Median UFO Sightings")
plt.savefig("Resources/bar_chart_states.png")

In [None]:
# Ten states with the smallest median duration, with the state codes upper-cased.
bottom_states = ufo_state.nsmallest(n=10, columns="median duration (seconds)").assign(
    state=lambda frame: frame["state"].str.upper()
)

In [None]:
# Horizontal bar chart of the ten states with the lowest median sighting duration.
duration = bottom_states["median duration (mins)"]
states = bottom_states["state"]

plt.figure()
plt.barh(states, duration, color="chartreuse", edgecolor="black")
plt.yticks(states, states)
# Same tick positions as the longest-duration chart.
plt.xticks([0, 1, 2, 3, 4, 5, 6, 7])
plt.xlabel("Median Duration of Sighting (Minutes)")
plt.ylabel("State Abbrev.")
# The bars are per-state medians, so the title says "Median" to agree with the x-axis label.
plt.title("States with the Shortest Median UFO Sightings")
plt.savefig("Resources/bar_chart_bottom_states.png")

# UFO Shape Pie Chart

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# Re-read the raw sightings export for the shape analysis.
file_path = "Resources/ufo_sightings_scrubbed.csv"
ufo_file = pd.read_csv(filepath_or_buffer=file_path)

# Peek at the first rows (not rendered: only a cell's final expression is shown).
ufo_file.head(n=5)

# Number of sightings recorded per country code.
ufo_file["country"].value_counts()

In [None]:
# Drop rows with any missing value, then keep only the 'us' sightings.
ufo_file = ufo_file.dropna(how="any").loc[lambda frame: frame['country'] == 'us', :]

ufo_file["country"].value_counts()

# List the distinct shape labels present before they are grouped.
unique_shapes = ufo_file['shape'].unique()
print(unique_shapes)

In [None]:
# Broader category -> raw shape labels that are folded into it.
shape_groups = {
    'light': ('light', 'fireball', 'flash', 'flare'),
    'triangular': ('triangle', 'diamond', 'cone', 'delta', 'pyramid'),
    'circular': ('circle', 'sphere', 'disk', 'oval', 'cigar', 'round'),
    'changing': ('changed',),
    'egg/teardrop': ('egg', 'teardrop'),
    'other': ('crescent', 'hexagon', 'cross', 'chevron'),
}

# Invert to the raw-label -> category mapping that Series.replace expects.
shape_lookup = {
    raw_label: category
    for category, raw_labels in shape_groups.items()
    for raw_label in raw_labels
}

# Labels that are not keys of the mapping are left unchanged by replace.
ufo_file['shape'] = ufo_file['shape'].replace(shape_lookup)

In [None]:
# Distinct shape categories after grouping, and how many sightings each has.
shape_column = ufo_file['shape']
unique_shapes = shape_column.unique()
shapes = shape_column.value_counts()
print(shapes)

In [None]:
# Pie chart of sightings per shape category.
# Labels come from the counted categories themselves, so every wedge is
# labelled with its own category whatever order value_counts() returns.
pie_labels = [str(shape_name).title() for shape_name in shapes.index]

# Pull out the first three wedges; value_counts() sorts by descending count,
# so these are the three most common categories. Sized from the data so the
# call also works when the number of categories is not exactly ten.
explode = [0.1 if position < 3 else 0 for position in range(len(shapes))]

# Start a fresh figure so the pie is not drawn onto an earlier, still-active figure.
plt.figure()
plt.pie(shapes, explode=explode, labels=pie_labels)
plt.title('UFO Shapes')

plt.savefig('UFO_Shapes_Pie_Chart')
plt.show()

# Heatmap

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import requests
import time
from config import gkey
import gmaps
# Hand the API key (imported from the local config module as gkey) to gmaps.
gmaps.configure(api_key=gkey)

In [None]:
# Load the cleaned sightings and give the coordinate columns short names.
file_path = "Resources/ufo_clean.csv"
df = (
    pd.read_csv(file_path)
    .rename(columns={"latitude": "lat"})
    # The longitude column is addressed by position: whichever column is last becomes "lng".
    .pipe(lambda frame: frame.rename(columns={frame.columns[-1]: "lng"}))
)
df.head()

In [None]:
# Two-column coordinate frame, cast to float for the heatmap layer.
coordinate_columns = ['lat', 'lng']
lat_lon = df[coordinate_columns]
locations = lat_lon.astype('float')

In [None]:
# Build the heatmap layer from the sighting coordinates.
# NOTE(review): max_intensity=0 looks unusual for an intensity ceiling — confirm it is intended.
heat_layer = gmaps.heatmap_layer(
    locations,
    dissipating=False,
    max_intensity=0,
    point_radius=1,
)

# Attach the layer to a new map figure and render it as the cell output.
fig = gmaps.figure()
fig.add_layer(heat_layer)
fig

# Military Base v. UFO Sightings Scatter

In [None]:
%matplotlib notebook

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import linregress

In [None]:
# Load the per-state frequency table and actually remove the 'Unnamed: 0'
# column. DataFrame.drop returns a new frame, so its result must be kept;
# calling it without assignment leaves scatterdata unchanged.
# NOTE(review): 'Unnamed: 0' is presumably a saved row index — confirm.
scatterdata = pd.read_csv("scatterplot.csv").drop(columns='Unnamed: 0')

# Show the frame as the cell output.
scatterdata

In [None]:
# Least-squares line of UFO-sighting frequency against military-base frequency.
# The result is read by attribute rather than unpacked into `_`, because
# assigning to `_` in a notebook shadows IPython's last-output variable.
regression = linregress(scatterdata['Frequency of Military'], scatterdata['Frequency of UFO'])
slope = regression.slope
intercept = regression.intercept

# Fitted y-values at each observed x, used to draw the trend line.
fit = slope * scatterdata['Frequency of Military'] + intercept

In [None]:
# Observed points (markers only) plus the fitted regression line (blue dashes).
military_freq = scatterdata['Frequency of Military']
ufo_freq = scatterdata['Frequency of UFO']

fig, ax = plt.subplots()
ax.plot(military_freq, ufo_freq, linewidth=0, marker='o')
ax.plot(military_freq, fit, 'b--')
ax.set_xlabel("Frequency of Military Bases")
ax.set_ylabel("Frequency of UFO Sightings")
fig.suptitle("Military Base Location vs UFO Sightings", fontsize=16, fontweight="bold")

# State Population v. UFO Sightings whisker

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
import numpy as np
import scipy.stats as stats

In [None]:
# Paths of the three input files for the population analysis.
file_path = "ufo_sightings_scrubbed.csv"
file_path2 = "acs2017_census_tract_data.csv"
file_path3 = "UFO_Sighting_State_Population_data.csv"

# Raw sightings and census tract data.
ufo_file = pd.read_csv(filepath_or_buffer=file_path)
census_data = pd.read_csv(filepath_or_buffer=file_path2)
ufo_file.head(n=5)

# Pre-combined table of state population and UFO sighting counts.
combined_ufo_state_data = pd.read_csv(filepath_or_buffer=file_path3)

In [None]:
# Sightings whose country code is 'us'.
us_rows = ufo_file['country'] == 'us'
cities = ufo_file[us_rows]

In [None]:
# Number of sightings per city, as a Series and as a one-column DataFrame.
city_sightings = cities['city'].value_counts()
city_sightings_df = city_sightings.to_frame()

In [None]:
# Number of sightings per state, as a Series and as a one-column DataFrame.
state_sightings = cities['state'].value_counts()
state_sightings_df = state_sightings.to_frame()

In [None]:
# First five census rows of every state (GroupBy.head keeps up to n rows per group).
census_data.groupby(by='State').head(n=5)

In [None]:
# Keep only the state name and the tract population from the census data.
population_columns = ['State', 'TotalPop']
census_state_population = census_data[population_columns]
census_state_population.head(n=5)

In [None]:
# Sum the tract populations within each state.
population_by_state = census_state_population.groupby(by=["State"])
census_state_population_sum = population_by_state.sum()
census_state_population_sum.head(n=5)

In [None]:
# Preview the first five rows of the pre-combined population/sightings table.
combined_ufo_state_data.head(n=5)


In [None]:
# Keep only the state, its population and its sighting count.
summary_columns = ['State', 'TotalPop', 'UFO Sightings']
combined_ufo_state_data_df = combined_ufo_state_data[summary_columns]
combined_ufo_state_data_df.head(n=5)

In [None]:
# Order the states from least to most populous.
sorted_combined_ufo_state_data_df = combined_ufo_state_data_df.sort_values(
    by='TotalPop',
    ascending=True,
)
sorted_combined_ufo_state_data_df.head(n=5)

In [None]:
# Sightings per resident, and the same rate scaled to sightings per 10,000 residents.
sightings_per_person = (
    sorted_combined_ufo_state_data_df['UFO Sightings']
    / sorted_combined_ufo_state_data_df['TotalPop']
)
sorted_combined_ufo_state_data_df['Sightings per Pop'] = sightings_per_person
sorted_combined_ufo_state_data_df['Sightings per Pop * 10K'] = (
    sorted_combined_ufo_state_data_df['Sightings per Pop'] * 10000
)
sorted_combined_ufo_state_data_df.head(n=5)

In [None]:

# Five states with the lowest sightings-per-10K rate.
sorted_combined_ufo_state_data_df.sort_values(
    by='Sightings per Pop * 10K',
    ascending=True,
).head(n=5)

In [None]:
# Five states with the highest sightings-per-10K rate.
sorted_combined_ufo_state_data_df.sort_values(
    by='Sightings per Pop * 10K',
    ascending=False,
).head(n=5)

In [None]:
# Population bin edges and the label for each of the three resulting bins.
size_classification = [0, 4_000_000, 8_000_000, 40_000_000]
size_labels = ["Small", "Medium", "Large"]

# Label every state by the population bin it falls into.
sorted_combined_ufo_state_data_df['Size Classification'] = pd.cut(
    sorted_combined_ufo_state_data_df['TotalPop'],
    bins=size_classification,
    labels=size_labels,
)

In [None]:
# Render the whole frame as the cell output.
sorted_combined_ufo_state_data_df

In [None]:
# Number of states in each population size class.
size_group = sorted_combined_ufo_state_data_df.groupby(by='Size Classification')
size_counts = size_group['Size Classification'].count()

size_counts

In [None]:
# Box-and-whisker plot of sightings per 10K residents, one box per size class.
# A bare `TotalPop.astype('int')` statement was dropped from this cell: astype
# returns a new Series, so without assignment it changed nothing, and the
# plot does not read TotalPop.
sorted_combined_ufo_state_data_df.boxplot(
    'Sightings per Pop * 10K',
    by='Size Classification',
    figsize=(12, 8),
)

In [None]:
# Sightings-per-10K values of the Small, Medium and Large states, in that order.
group1, group2, group3 = (
    sorted_combined_ufo_state_data_df.loc[
        sorted_combined_ufo_state_data_df['Size Classification'] == size_label,
        'Sightings per Pop * 10K',
    ]
    for size_label in ('Small', 'Medium', 'Large')
)

In [None]:
# One-way ANOVA (scipy.stats.f_oneway) on the sightings-per-10K values of the
# Small, Medium and Large groups; the result is shown as the cell output.
stats.f_oneway(group1, group2, group3)

# Line Chart

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
import numpy as np

# Re-read the raw sightings export for the sightings-per-year chart.
file_path = "Resources/ufo_sightings_scrubbed.csv"
ufo_file = pd.read_csv(filepath_or_buffer=file_path)
ufo_file.head(n=5)

In [None]:
# Drop rows with any missing value, then keep only the 'us' sightings.
ufo_file = ufo_file.dropna(how="any").loc[lambda frame: frame['country'] == 'us', :]

# Not rendered: only the last expression of a cell is displayed.
ufo_file["country"].value_counts()

ufo_file.head(n=5)

In [None]:
# Not rendered: only the last expression of a cell is displayed.
ufo_file.head(n=5)

# How many sightings share each exact datetime string.
datetime_column = ufo_file["datetime"]
datetime_column.value_counts()

In [None]:
# Cut each datetime string at its last "/" and keep the right-hand piece.
after_last_slash = ufo_file["datetime"].str.rsplit("/", n=1, expand=True)[1]

# Cut that piece at its last space: column 0 is the text before it, column 1 the text after.
split_datetime = after_last_slash.str.rsplit(" ", n=1, expand=True)
split_datetime.head(n=5)

In [None]:
# Count the rows for each distinct value of column 0 (the year text split out
# of the datetime string) and cast the counts to float for plotting.
sighting_year = split_datetime.groupby([0]).count().astype(float)

# Plot the per-year counts. A bare `sighting_year.cumsum()` statement was
# dropped from this cell: cumsum returns a new frame, so without assignment
# it never affected the plot. To chart the running total instead, plot
# `sighting_year.cumsum()`.
sighting_year.plot(color='Chartreuse', linestyle='-.', linewidth=5)
plt.xlabel('Year', fontsize=18)
plt.ylabel('Number of Sightings', fontsize=18)