In [21]:
import pandas as pd
import geopandas as gpd
from geopandas import GeoDataFrame
from shapely.geometry import Point
%matplotlib notebook
import matplotlib.pyplot as plt
import numpy as np

ModuleNotFoundError: No module named 'geopandas'

In [None]:
file = "Resources/scrubbed_csv.csv"


ufo_data = pd.read_csv(file)

ufo_data.head()

In [None]:
ufo_clean = ufo_data.dropna()


#print(len(ufo_clean))


ufo_clean = ufo_clean.loc[ufo_clean["country"] == "us"]

print(len(ufo_clean))

ufo_clean.head()

In [None]:
ufo_clean[['month', 'day', 'year']] = ufo_clean['datetime'].str.split('/',expand=True)
ufo_clean[['year', 'time']] = ufo_clean['year'].str.split(' ',expand=True)


In [None]:
ufo_clean

In [None]:
ufo_month = ufo_clean[["city", "state", "shape", "month", "day", "year", "time", "duration (seconds)" ]]

ufo_all = ufo_month.dropna()

#ufo_month["month"].value_counts()

ufo_all.head()

In [None]:
ufo_all['month'] = pd.to_datetime(ufo_all['month'], format='%m').dt.month_name().str.slice(stop=3)

ufo_all

In [None]:
ufo_all["month"].value_counts()

We wanted to see how many sightings occured in each month. July showed the most activity, while February showed the least activity. We expected seasons to have an effect on sightings and that was spot on. Summer showed the most sightings.

In [None]:
ufo_month_only = ufo_all[["month"]]

month_group = ufo_month_only["month"].unique()

ufo_month_count = ufo_month_only.value_counts()

#ufo_month_count = ufo_month_count.sort_values(ascending=True)

month_counts = pd.DataFrame({"Total Sightings": ufo_month_count})
month_counts.set_index(month_group)
month_counts = month_counts.reset_index()

month_counts

ax1 = month_counts.plot(kind="bar", title="Reported Sightings by Month", xlabel="Months", ylabel="Sightings", align="center",
                  color="green", figsize=(10,5), x="month", y="Total Sightings")
ax1.set_ylim(0, 8000)
plt.show()



The year with the most sightings is 2012, we decided to avoid using years with less than 500 sightings because it didn't seem like the sample size was enough. We noticed as the years were more recent there were more sightings. Unfortunately we don't have any hard data to support why there were more reports. But we think it has something to do with availability of cell phones.

In [None]:
ufo_year_only = ufo_all[["year"]]

year_group = ufo_year_only["year"].unique()
year_group

ufo_year_count = ufo_year_only.value_counts()
#ufo_year_count



year_counts = pd.DataFrame({"Total Sightings": ufo_year_count})
year_counts.set_index(year_group)
year_counts = year_counts.reset_index()
year_counts = year_counts[year_counts["Total Sightings"] > 500] 

year_counts.head()

ax1 = year_counts.plot(kind="bar", title="Reported Sightings by Year", xlabel="Years", ylabel="Sightings", align="center", color="green",
                 figsize=(10,5), x="year", y="Total Sightings")
ax1.set_ylim(0, 7000)
plt.show()

In [None]:
ufo_state_only = ufo_all[["state"]]

for state in ufo_state_only.columns:
    ufo_state_only["state"] = ufo_state_only["state"].str.upper() 

state_group = ufo_state_only["state"].unique()

ufo_state_count = ufo_state_only.value_counts()

#ufo_month_count = ufo_month_count.sort_values(ascending=True)

state_counts = pd.DataFrame({"Total Sightings": ufo_state_count})
state_counts.set_index(state_group)
state_counts = state_counts.reset_index()
state_counts = state_counts[state_counts["Total Sightings"] > 15]

state_counts

ax1 = state_counts.plot(kind="bar", title="Reported Sightings by State", xlabel="State", ylabel="Sightings",
                  color="green", figsize=(18,5), x="state", y="Total Sightings")
ax1.set_ylim(0, 9000)
plt.show()

Light has by far the most sightings, which influences the mean duration of the sightings. Triangle has the second most sightings, but a very low sighting duration time. 

In [None]:
ufo_duration = ufo_all[["shape", "duration (seconds)"]]

for shape in ufo_duration.columns:
    ufo_duration["shape"] = ufo_duration["shape"].str.upper() 


ufo_duration["duration (seconds)"] = ufo_duration["duration (seconds)"].astype("float")

# ufo_shape_count = ufo_duration["shape"].value_counts()
# ufo_shape_count

ufo_shape_duration = ufo_duration.groupby(["shape"]).mean().reset_index()
#ufo_shape_duration = ufo_shape_duration["shape"].drop(["changed", "crescent", "delta", "flare", "hexagon", "pyramid", "round"])

#changed, crescent, delta, flare, hexagon, pyramid, round
#0, 6, 9, 14, 17, 21, 23
# ufo_duration.dtypes

ufo_shape_duration = ufo_shape_duration.drop(ufo_shape_duration.index[[0, 6, 9, 14, 17, 21, 23]])
#ufo_shape_duration.head(30)

ax1 = ufo_shape_duration.plot(kind="scatter", title="Average Sighting Time per Shape (Seconds)", x="shape", y="duration (seconds)",
                              xlabel="Shape", ylabel="Mean Duration (Seconds)", color="green", s=60, figsize=(25,7))

ax1.set_ylim(0, 20000)

plt.show()

In [None]:
ufo_clean = ufo_data.loc[ufo_data["country"]=="us"]
ufo_clean[['month', 'day', 'year']] = ufo_clean['datetime'].str.split('/',expand=True)
ufo_clean[['year', 'time']] = ufo_clean['year'].str.split(' ',expand=True)
ufo_clean=ufo_clean.dropna()
ufo_clean['duration (seconds)']=pd.to_numeric(ufo_clean['duration (seconds)'])
ufo_clean['latitude']=pd.to_numeric(ufo_clean['latitude'])
ufo_clean['longitude']=pd.to_numeric(ufo_clean['longitude ']) #note: 'longitude ' has a space at the end
ufo_clean['month']=pd.to_numeric(ufo_clean['month'])
ufo_clean['day']=pd.to_numeric(ufo_clean['day'])
ufo_clean['year']=pd.to_numeric(ufo_clean['year'])
ufo_clean.dtypes

In [None]:
#3 - Convert UFO Sightings to Geopandas Geodataframe
gdf_sightings = GeoDataFrame( 
    ufo_clean.drop(['longitude ','latitude'],axis=1),
    crs={'init':'epsg:4326'}, #WGS84 coordinate system
    geometry=[Point(xy) for xy in zip(ufo_clean.longitude ,ufo_clean.latitude)])
#gdf_sightings.head()

In [None]:
#4 - Import Military Bases data
military = gpd.read_file("../Resources/military-bases.geojson")
#military.head()


In [None]:
#5 - Adds a buffer of 5 mile to UFO Sightings
ufo_buffer = gdf_sightings
#ufo_buffer = gdf_sightings.loc[gdf_sightings["year"]==2013]
#ufo_buffer = ufo_buffer.loc[ufo_buffer["month"]==7]
ufo_buffer.geometry = gdf_sightings.geometry.buffer(5*(1/60)) #Note: Rough conversion of degree to mile (1/60)
#ufo_buffer.head()

In [None]:
#6 - Spatial Join of UFO Sightings intersecting Military Bases using a buffer of 5 miles
ufo_join = gpd.sjoin(ufo_buffer,military,op="intersects")
#ufo_join

In [None]:
#7 - Create a group based on the values in the 'state' column and count how many times each state has UFO Sightings
overall_state_group = ufo_clean.groupby('state')
count_overall_state = overall_state_group['state'].count()
#count_overall_state
military_state_group = ufo_join.groupby('state')
count_military_state = military_state_group['state'].count()
#count_military_state

In [None]:
#8 - Create a line chart based off of the 'state' group series
plt.figure(figsize=(20,3))
plt.title("UFO Sightings by State: Overall vs. Military")
plt.plot(count_overall_state,color='blue',label="Overall")
plt.plot(count_military_state,color='red',label="Military")
plt.xlabel("State")
plt.ylabel("Number of UFO Sightings")
plt.legend(loc="best")
plt.ylim(0,9000)
plt.grid()
plt.savefig("../Images/OverallvsMilitaryByState.png")
plt.show()

In [None]:
#9 - Create a group based on the values in the 'component' column and count how many times each military branch appears in our group
ufo_join = ufo_join.replace({"AF Active":"Air Force","AF Guard":"Air Force","AF Reserve":"Air Force","Army Active":"Army","Army Guard":"Army","Army Reserve":"Army","MC Active":"Marine Corp","MC Reserve":"Marine Corp","Navy Active":"Navy","WHS":"Other"})
branch_group = ufo_join.groupby('component')
count_branch = branch_group['component'].count()
count_branch = count_branch.reset_index(name='count')
count_branch = count_branch.sort_values(['count'],ascending=False)
count_branch = pd.DataFrame(count_branch)
#count_branch

In [None]:
#10 - Create a pie chart based off the 'component' group series
data = count_branch["count"]
labels = count_branch["component"]
explode = (0.05,0.05,0.05,0.05,0.15)
plt.title("UFO Sightings by Military Branch")
plt.pie(data,explode=explode,labels=labels,autopct="%1.1f%%",shadow=True,startangle=90)
plt.axis("equal")
plt.savefig("../Images/Branches.png")
plt.show()

In [None]:
#11 - Create a group based on the values in the "year" column for both UFO Sightings and Military Bases
group_ufo_years = ufo_clean.groupby('year')
count_ufo_years = group_ufo_years['year'].count()
#count_ufo_years
group_military_years = ufo_join.groupby('year')
count_military_years = group_military_years['year'].count()
#count_military_years

In [None]:
#12 - Create a line graph based off the 'year' group series for Total UFO Sightings vs. Military UFO Sightings
plt.title("UFO Sightings: Overall vs. Military")
plt.plot(count_ufo_years,color='blue',label="Overall")
plt.plot(count_military_years,color='red',label="Military")
plt.xlabel("Years")
plt.ylabel("Number of UFO Sightings")
plt.legend(loc="upper left")
plt.xlim(1998,2014)
plt.ylim(0,7000)
plt.grid()
plt.savefig("../Images/OverallvsMilitary.png")
plt.show()

In [None]:
#----------------------------

ufo_data["shape"].value_counts()

In [None]:
# Combining similar shapes together
ufo_data = ufo_data.replace(
    {"unknown": "other", "delta": "other","round": "other", "changed": "other","pyramid": "other", "hexagon": "other","crescent": "other", "flare": "other"})
ufo_data["shape"].value_counts()

In [None]:
# List all the columns in the table
ufo_data.columns

In [22]:
# Using GroupBy in order to separate the data into fields according to "state" values
grouped_usa_df = ufo_data.groupby(['state','shape'])

# The object returned is a "GroupBy" object and cannot be viewed normally...
print(grouped_usa_df)

# In order to be visualized, a data function must be used...
grouped_usa_df.count()

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000002392ED5C6D8>


Unnamed: 0_level_0,Unnamed: 1_level_0,datetime,city,country,duration (seconds),duration (hours/min),comments,date posted,latitude,longitude
state,shape,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ab,changing,4,4,3,4,4,4,4,4,4
ab,chevron,2,2,1,2,2,2,2,2,2
ab,cigar,10,10,9,10,10,10,10,10,10
ab,circle,29,29,23,29,29,29,29,29,29
ab,cone,4,4,4,4,4,4,4,4,4
...,...,...,...,...,...,...,...,...,...,...
yt,light,1,1,1,1,1,1,1,1,1
yt,other,1,1,0,1,1,1,1,1,1
yt,sphere,3,3,1,3,3,3,3,3,3
yt,triangle,2,2,1,2,2,2,2,2,2


In [23]:
shape_counts=grouped_usa_df["shape"].value_counts()
shape_counts

state  shape     shape   
ab     changing  changing     4
       chevron   chevron      2
       cigar     cigar       10
       circle    circle      29
       cone      cone         4
                             ..
yt     light     light        1
       other     other        1
       sphere    sphere       3
       triangle  triangle     2
       unknown   unknown      2
Name: shape, Length: 1304, dtype: int64

In [24]:
# Convert the state_counts Series into a DataFrame
state_shape_counts_df = pd.DataFrame(shape_counts)
state_shape_counts_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,shape
state,shape,shape,Unnamed: 3_level_1
ab,changing,changing,4
ab,chevron,chevron,2
ab,cigar,cigar,10
ab,circle,circle,29
ab,cone,cone,4
...,...,...,...
yt,light,light,1
yt,other,other,1
yt,sphere,sphere,3
yt,triangle,triangle,2


In [25]:
# Convert the column name into "Number of Shapes Per State"
state_shape_counts_df = state_shape_counts_df.rename(
    columns={"shape": "Number of Shapes Per State"})
state_shape_counts_df
state_shape_counts_df.to_csv("Resources/state_shape_counts_dfs.csv", index=True)

In [26]:
most_shapes = state_shape_counts_df.sort_values(["state", "Number of Shapes Per State"], ascending=False)
most_shapes

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Number of Shapes Per State
state,shape,shape,Unnamed: 3_level_1
yt,sphere,sphere,3
yt,circle,circle,2
yt,fireball,fireball,2
yt,triangle,triangle,2
yt,unknown,unknown,2
...,...,...,...
ab,cone,cone,4
ab,egg,egg,3
ab,teardrop,teardrop,3
ab,chevron,chevron,2


In [27]:
# Pie charno_null_ufo_df["shape"].value_counts()t, where the slices will be ordered and plotted counter-clockwise:
fig, ax = plt.subplots(figsize=(20, 15), subplot_kw=dict(aspect="equal"))

labels = ['light','triangle','circle', 'fireball','other','sphere','disk','oval', 'formation','cigar', 'changing','rectangle', 'flash','cylinder', 'diamond', 'chevron','teardrop','egg','cone', 'cross'] 
sizes = [ 13407, 8961, 6511, 6077, 5120, 4310, 4070, 3004, 1979,1627, 1566, 1062, 1061, 1014, 920, 812, 590,577, 246, 185]


wedges, texts = ax.pie(sizes, wedgeprops=dict(width= 0.5), startangle=-40)

bbox_props = dict(boxstyle="square,pad=0.3", fc="w", ec="k", lw=0.72)
kw = dict(arrowprops=dict(arrowstyle="-"),
          bbox=bbox_props, zorder=0, va="center")

for i, p in enumerate(wedges):
    ang = (p.theta2 - p.theta1)/2. + p.theta1
    y = np.sin(np.deg2rad(ang))
    x = np.cos(np.deg2rad(ang))
    horizontalalignment = {-1: "right", 1: "left"}[int(np.sign(x))]
    connectionstyle = "angle,angleA=0,angleB={}".format(ang)
    kw["arrowprops"].update({"connectionstyle": connectionstyle})
    ax.annotate(labels[i], xy=(x, y), xytext=(1.35*np.sign(x), 1.4*y),
                horizontalalignment=horizontalalignment, **kw)

ax.set_title("           USA UFO Shapes", fontsize=30)
plt.savefig("../Project1/UFO_Shapes_USA.png")
plt.show()

<IPython.core.display.Javascript object>

FileNotFoundError: [Errno 2] No such file or directory: '../Project1/UFO_Shapes_USA.png'

In [None]:
file = "state_shape_counts_dfs.csv"

In [None]:
original_df = pd.read_csv(file)
original_df.head()

In [None]:
best_shape_df= original_df.pivot(index='shape', columns='state', values="Number of Shapes Per State")

In [None]:
state_shapes=original_df.set_index("state")
state_shapes

In [None]:
[best_shape_df['az']/best_shape_df['az'].sum() *100]

In [None]:
shapes_dict={}
for column in best_shape_df.columns[1:]:
    shapes_dict[column]=[best_shape_df[column].values/best_shape_df[column].sum() *100] 
    
shapes_dict['az']

In [None]:
shape_chart_df=pd.DataFrame({k : v [0] for k, v in shapes_dict.items()})

In [None]:
category_names = ['light','triangle','circle', 'fireball','other','sphere','disk','oval', 'formation','cigar', 'changing','rectangle', 'flash','cylinder', 'diamond', 'chevron','teardrop','egg','cone', 'cross'] 
results = { 
   }

for k,v in shapes_dict.items():
    results[k]=np.nan_to_num(v[0])  #nan_to_num to change missing data to 0
    
def survey(results, category_names):
    """
    Parameters
    ----------
    results : dict
        A mapping from question labels to a list of answers per category.
        It is assumed all lists contain the same number of entries and that
        it matches the length of *category_names*.
    category_names : list of str
        The category labels.
    """
    labels = list(results.keys())
    data = np.array(list(results.values()))
    data_cum = data.cumsum(axis=1)
    category_colors = plt.get_cmap('RdYlGn')(
        np.linspace(0.15, 0.85, data.shape[1]))

    fig, ax = plt.subplots(figsize=(9.2, 5))
    ax.invert_yaxis()
    ax.xaxis.set_visible(False)
    ax.set_xlim(0, np.sum(data, axis=1).max())

    for i, (colname, color) in enumerate(zip(category_names, category_colors)):
        widths = data[:, i]
        starts = data_cum[:, i] - widths
        ax.barh(labels, widths, left=starts, height=0.5,
                label=colname, color=color)
        xcenters = starts + widths / 2

        r, g, b, _ = color
        text_color = 'white' if r * g * b < 0.5 else 'darkgrey'
        for y, (x, c) in enumerate(zip(xcenters, widths)):
            ax.text(x, y, str(int(c)), ha='center', va='center',
                    color=text_color)
    ax.legend(ncol=len(category_names), bbox_to_anchor=(0, 1),
              loc='lower left', fontsize='small')

    fig.set_size_inches(25, 50)
    fig.suptitle("Shape Percentages Per State",fontsize=50)
    plt.savefig("horizontal_bar.png")
    return fig, ax


survey(results, category_names)
plt.show()

In [None]:
#----------------------------------------

ufo_city_group = ufo_month.groupby(['city']).count()
ufo_city_group.sort_values(by="month", ascending = False).head(10)
#most sightings by city for all sightings reported

In [None]:
ufo_city_group = ufo_month.groupby(['year']).count()
ufo_city_group.sort_values(by="month", ascending = False).head(10)
#most sightings by year for all sightings reported

In [None]:
year_df = ufo_month.loc[(ufo_month['year'] == '2012')]
year_df.count()
#verify how many sightings are in a chosen year

In [None]:
year_df_group = year_df.groupby(['city']).count()
year_df_group.sort_values(by="state", ascending = False).head(10)
#top city sightings in year group

In [None]:
ufo_month.astype({"year":'int64'}).dtypes
#determine data type for latitude and longitude
#change year to integer

In [None]:
top_cities_year_df = year_df.loc[(year_df['city'] == 'seattle') | (year_df['city'] == 'phoenix') | (year_df['city'] == 'las vegas') | (year_df['city'] == 'los angeles') | (year_df['city'] == 'portland') | (year_df['city'] == 'san diego') | (year_df['city'] == 'houston') | (year_df['city'] == 'chicago') | (year_df['city'] == 'tucson') | (year_df['city'] == 'miami')] 
top_cities_year_df['Airport Name'] = ""
top_cities_year_df['Airport Lat'] = ""
top_cities_year_df['Airport Lng'] = ""
cities_airport_df = top_cities_year_df.loc[(top_cities_year_df['state'] != 'me') | (top_cities_year_df['state'] != 'ok')]
cities_airport_df.head(10)

#create dataframe to hold airport lat and lng from API call and to parse down to top sighting cities for chosen year

In [None]:
# Count how many sightings have occured within each city
cities_airport_df["city"].count
ufo_city_only = cities_airport_df.groupby(['city']).count()
ufo_city_only

In [None]:
#put city count to a dataframe
city_count = ufo_city_only.rename(columns={"state": "sightings"}, errors="raise")
city_count = city_count[["sightings"]]
city_count

In [None]:
#reset index so city is a column
city_count = city_count.reset_index()
city_count

In [None]:
#Create a bar chart based upon the above data
plt.figure(figsize=(20,3))
plt.bar(city_count["city"], city_count["sightings"], color='r', alpha=0.5, align="edge")
tick_locations = [value+0.4 for value in x_axis]
plt.xticks(tick_locations, city_count["city"], rotation="vertical")

plt.title("Sightings by City for Year 2012")
plt.xlabel("Cities")
plt.ylabel("Number of Sightings")

In [None]:
#make API call for airport latitude and longitude around cities with most sightings
from config import gkey


# geocoordinates
target_radius = 5000
target_type = "airport"

# set up a parameters dictionary
params = {
    "radius": target_radius,
    "type": target_type,
    "key": gkey
}

for index, row in cities_airport_df.iterrows():
    lat = row['latitude']
    lng = row['longitude ']
    
    params["location"] = f"{lat},{lng}" 
   
    # base url
    base_url = "https://maps.googleapis.com/maps/api/place/nearbysearch/json"

    # run a request using our params dictionary
    response = requests.get(base_url, params=params)
    airport_data = response.json()
    print(airport_data)
    try:
        cities_airport_df.loc[index, 'Airport Name'] = airport_data['results'][0]['name']
        cities_airport_df.loc[index, 'Airport Lat'] = airport_data['results'][0]['geometry']['location']['lat']
        cities_airport_df.loc[index, 'Airport Lng'] = airport_data['results'][0]['geometry']['location']['lng']
      
    except (KeyError, IndexError):
        print("Missing airport")

In [None]:
#display dataframe to verify it populated from API pull
cities_airport_df

In [None]:
File = "Resources/airport_csv.csv"
cities_airport_df.to_csv('file', encoding='utf-8')
cities_airport_df.head(10)

In [None]:
fig = gmaps.figure()

city_locations = cities_airport_df[['latitude', 'longitude ']].astype(float)
markers = gmaps.heatmap_layer(city_locations)
fig.add_layer(markers)
#plot all of the chosen years' sightings

airport_locations2 = cities_airport_df[['Airport Lat', 'Airport Lng']].astype(float)
markers2 = gmaps.heatmap_layer(airport_locations2, gradient=[(0,0,0,0),"blue","white"], point_radius=3, dissipating = False)
fig.add_layer(markers2)
#plot all airports within 5000 m radius of sighting

fig