In [109]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [110]:
# Importing any additional libraries that may be useful

import seaborn as sns

import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import matplotlib
plt.style.use('ggplot')
from matplotlib.pyplot import figure

In [111]:
# Load the Top 250 rankings CSV into the working DataFrame `df`.
top250_csv = "../input/restaurant-business-rankings-2020/Top250.csv"
df = pd.read_csv(top250_csv)

In [112]:
# Preview the first ten rows of the Top 250 data.
df.iloc[:10]

In [113]:
# Report, per column, the percentage of rows whose value is missing.
for column_name in df.columns:
    # The mean of a boolean null-mask is the fraction of missing entries.
    share_missing = df[column_name].isna().mean()
    print('{} - {}%'.format(column_name, round(share_missing * 100)))

# 'Content' and 'Headquarters' appear to be the columns with missing data.
# Upon review, the missing data is not pertinent to the analysis.

In [114]:
# Show the data type pandas inferred for each column.
column_types = df.dtypes
print(column_types)

In [115]:
# Continue cleaning the data before analysis: remove exact duplicate rows.
# drop_duplicates() returns a new DataFrame rather than modifying `df`,
# so the result has to be assigned back or the call has no lasting effect.
rows_before = len(df)
df = df.drop_duplicates()
print('Dropped {} duplicate rows'.format(rows_before - len(df)))


In [116]:
# Scatter plot of 'Sales' against 'Rank' for the Top 250 data.
fig, ax = plt.subplots()
ax.scatter(df['Sales'], df['Rank'])
ax.set(title='Sales vs Rank', xlabel='# of Sales', ylabel='Rank')
# Render only after the title and labels are set. The original `plt.show`
# lacked parentheses, so the function was never actually called.
plt.show()

# Obvious correlation between 'Sales' and 'Rank'.
# Curious to look for correlations between other variables.

In [117]:
# Scatter plot of 'Sales' against 'Units' for the Top 250 data.
fig, ax = plt.subplots()
ax.scatter(df['Sales'], df['Units'])
ax.set(title='Sales vs Units', xlabel='# of Sales', ylabel='Units')
# Render only after the title and labels are set. The original `plt.show`
# lacked parentheses, so the function was never actually called.
plt.show()

# 'Units' is the number of locations a restaurant actively has.
# Some outliers are represented, but generally it looks like a positive correlation.

In [118]:
# Regression plot for a quick look at how 'Units' varies with 'Sales'.
sns.regplot(data=df, x="Sales", y="Units")


In [119]:
# Pearson correlation matrix for the Top 250 data.
# Restrict to numeric columns first: since pandas 2.0, DataFrame.corr no
# longer drops text columns silently and raises an error on them instead.
df.select_dtypes(include='number').corr(method='pearson')

In [120]:
# Correlation matrix over numeric columns only; DataFrame.corr raises on
# text columns in pandas 2.0 and later.
correlation_matrix = df.select_dtypes(include='number').corr()

# Draw the matrix as an annotated heatmap on an explicit Axes.
fig, ax = plt.subplots()
sns.heatmap(correlation_matrix, annot=True, ax=ax)
ax.set_title("Correlation Matrix for Variables T1")
plt.show()

# Easier way to visualize correlations between
# numeric values within the dataset.


In [121]:
# Found correlation between 'Sales' and 'Units' although 
# just because a company managed to have more units did not 
# mean that they necessarily placed better in the rankings. 

In [122]:
# Load the second dataset (Independence 100) into `df2`.
independence100_csv = "../input/restaurant-business-rankings-2020/Independence100.csv"
df2 = pd.read_csv(independence100_csv)

In [123]:
# Preview the first ten rows of the second dataset.
df2.iloc[:10]

In [124]:
# Report, per column of the second dataset, the percentage of missing rows.
for column_name in df2.columns:
    # The mean of a boolean null-mask is the fraction of missing entries.
    share_missing = df2[column_name].isna().mean()
    print('{} - {}%'.format(column_name, round(share_missing * 100)))

# No data seems to be missing, as represented by the "0%" for every column.

In [125]:
# Show the inferred column types of the second dataset.
column_types = df2.dtypes
print(column_types)

# 'Sales' and 'Meals Served' seem to carry an unneeded decimal point,
# so it may be better to remove it to clean up the data.

In [126]:
# Convert 'Sales' and 'Meals Served' from float64 to int64 for readability.
for float_column in ('Sales', 'Meals Served'):
    df2[float_column] = df2[float_column].astype('int64')

In [127]:
# Re-check the column types after the integer conversion.
column_types = df2.dtypes
print(column_types)

In [128]:
# Scatter plot of 'Sales' against 'Rank' for the second dataset.
fig, ax = plt.subplots()
ax.scatter(df2['Sales'], df2['Rank'])
ax.set(title='Sales Amount vs Rank', xlabel='Sales Amount', ylabel='Rank')
# Render only after the title and labels are set. The original `plt.show`
# lacked parentheses, so the function was never actually called.
plt.show()

# Rank is solely based on sales amount; visualizing anyway to
# see if there are any outliers.

In [129]:
sns.regplot(data=df2, x="Sales", y="Rank")

# Correlation seems to be strongly negative, as predicted:
# as sales go down, the rank number goes up (a worse placing).


In [130]:
# Scatter plot of 'Sales' against 'Average Check' for the second dataset.
fig, ax = plt.subplots()
ax.scatter(df2['Sales'], df2['Average Check'])
ax.set(title='Sales vs Average Check', xlabel='Sales', ylabel='Check Amount')
# Render only after the title and labels are set. The original `plt.show`
# lacked parentheses, so the function was never actually called.
plt.show()

# Visualizing the correlation between 'Average Check' and 'Sales'.
# Difficult to point out any correlation at the moment, but it seems
# to be barely sloping positively.

In [131]:
sns.regplot(data=df2, x="Sales", y="Average Check")

# The regression line shows a weak correlation.


In [132]:
# Scatter plot of 'Rank' against 'Average Check' for the second dataset.
fig, ax = plt.subplots()
ax.scatter(df2['Rank'], df2['Average Check'])
ax.set(title='Rank vs Average Check', xlabel='Rank', ylabel='Average Check in $')
# Render only after the title and labels are set. The original `plt.show`
# lacked parentheses, so the function was never actually called.
plt.show()

# At first glance, the data points seem scattered and a correlation is not
# easily detectable.

In [133]:
sns.regplot(data=df2, x="Rank", y="Average Check")

# The data shows another weak correlation here.

In [134]:
# May be more useful to find the correlation coefficients to
# better analyze the relationship between the variables.
# Restrict to numeric columns first: since pandas 2.0, DataFrame.corr no
# longer drops text columns silently and raises an error on them instead.
df2.select_dtypes(include='number').corr(method='pearson')

In [135]:
# Here are the above results visualized.

# Correlation matrix over numeric columns only; DataFrame.corr raises on
# text columns in pandas 2.0 and later.
correlation_matrix = df2.select_dtypes(include='number').corr()

# Draw the matrix as an annotated heatmap on an explicit Axes.
fig, ax = plt.subplots()
sns.heatmap(correlation_matrix, annot=True, ax=ax)
ax.set_title("Correlation Matrix for Variables T2")
plt.show()

# Data seems to show various weak correlations between the
# variables except for the obvious one of Rank and Sales.

# The next "strongest" correlation would be between the Average Check
# and Meals Served, albeit a strong negative correlation.
# The higher the average check, the lower the amount of meals served.


In [136]:
# Load the third dataset (Future 50) into `df3`.
future50_csv = "../input/restaurant-business-rankings-2020/Future50.csv"
df3 = pd.read_csv(future50_csv)

In [137]:
# Preview the first ten rows of the third dataset.
df3.iloc[:10]

In [138]:
# Scatter plot of 'Rank' against 'YOY_Sales' for the third dataset.
fig, ax = plt.subplots()
ax.scatter(df3['Rank'], df3['YOY_Sales'])
ax.set(title='Yearly Sales Increase vs Rank', xlabel='Rank', ylabel='% Sales Increase')
# NOTE(review): if 'YOY_Sales' is stored as text (e.g. "12.3%"), matplotlib
# treats the y-axis as categorical and these ticks are category positions,
# not percentage values. Confirm the column dtype.
ax.set_yticks([1, 10, 20, 30, 40, 50])
# Render only after all decoration is applied. The original `plt.show`
# lacked parentheses, so the function was never actually called.
plt.show()



In [139]:
# This 'Future 50' data seems to rank based on 'YOY_Sales'
# rather than 'Sales', as the 'Top 250' data set does.
# Higher % increases in sales = a higher rank overall

# Report, per column of the third dataset, the percentage of missing rows.
for column_name in df3.columns:
    # The mean of a boolean null-mask is the fraction of missing entries.
    share_missing = df3[column_name].isna().mean()
    print('{} - {}%'.format(column_name, round(share_missing * 100)))

# No data seems to be missing, as represented by the "0%" for every column.


In [140]:
# Show the inferred column types of the third dataset.
column_types = df3.dtypes
print(column_types)

# No floats, so the data shouldn't be as messy to work with.


In [148]:
# Scatter plot of 'Units' against 'YOY_Sales' for the third dataset.
fig, ax = plt.subplots()
ax.scatter(df3['Units'], df3['YOY_Sales'])
ax.set(title='Units vs. Sales Increase', xlabel='Units', ylabel='% Sales')
# NOTE(review): if 'YOY_Sales' is stored as text (e.g. "12.3%"), matplotlib
# treats the y-axis as categorical and these ticks are category positions,
# not percentage values. Confirm the column dtype.
ax.set_yticks([1, 10, 20, 30, 40, 50])
# Render only after all decoration is applied. The original `plt.show`
# lacked parentheses, so the function was never actually called.
plt.show()




In [142]:
# The number of units does not seem to have a large bearing
# on the size of the "% Sales" increase. For example,
# you can see 3 points at around a 120% increase in sales
# that sit at roughly 20 units, 30 units and 110 units
# on the x axis.

In [143]:
# Finding correlations between other variables in this third
# and final data set.
# Restrict to numeric columns first: since pandas 2.0, DataFrame.corr no
# longer drops text columns silently and raises an error on them instead.
df3.select_dtypes(include='number').corr(method='pearson')

In [144]:
# Correlation matrix over numeric columns only; DataFrame.corr raises on
# text columns in pandas 2.0 and later.
correlation_matrix = df3.select_dtypes(include='number').corr()

# Draw the matrix as an annotated heatmap on an explicit Axes.
fig, ax = plt.subplots()
sns.heatmap(correlation_matrix, annot=True, ax=ax)
ax.set_title("Correlation Matrix for Variables T3")
plt.show()

In [None]:
# Table seems to show generally weak correlations except for Units and Unit
# Volume. Those two variables show a strong negative correlation (-0.71). 