In [82]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


In [83]:
# read_excel options used here: skiprows=1 skips the first row of the sheet,
# and index_col=0 uses the first column as the row index.
sp500 = pd.read_excel('sp_500_crashes.xlsx', skiprows=1, index_col=0)

# Price columns to store as integers. Missing prices are filled with 0 first
# because NaN cannot be cast to int; the cast then drops any fractional part.
cols_to_fix = ['Adj Close', 'Close', 'High', 'Low', 'Open']
sp500 = (
    sp500
    .fillna(dict.fromkeys(cols_to_fix, 0))
    .astype(dict.fromkeys(cols_to_fix, int))
)

sp500.head()

Unnamed: 0_level_0,Adj Close,Close,High,Low,Open,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2007-07-02,1519,1519,1519,1504,1504,2648990000
2007-07-03,1524,1524,1526,1519,1519,1560790000
2007-07-05,1525,1525,1526,1517,1524,2622950000
2007-07-06,1530,1530,1532,1520,1524,2441520000
2007-07-09,1531,1531,1534,1527,1530,2715330000


In [84]:
# A more defensive version of the conversion above:
# it won't break if one of the columns is missing,
# which is useful for inconsistent or unpredictable data sources.

# cols_to_fix = ['Adj Close', 'Close', 'High', 'Low', 'Open']
# for col in cols_to_fix:
#     if col in sp500.columns:
#         sp500[col] = sp500[col].fillna(0).astype(int)

In [85]:
# Print a structural summary: index range, column dtypes, non-null counts, memory usage.
sp500.info()
# Summary statistics per column; as the cell's last expression this is what gets rendered.
sp500.describe()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1513 entries, 2007-07-02 to 2020-07-31
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   Adj Close  1513 non-null   int64
 1   Close      1513 non-null   int64
 2   High       1513 non-null   int64
 3   Low        1513 non-null   int64
 4   Open       1513 non-null   int64
 5   Volume     1513 non-null   int64
dtypes: int64(6)
memory usage: 82.7 KB


Unnamed: 0,Adj Close,Close,High,Low,Open,Volume
count,1513.0,1513.0,1513.0,1513.0,1513.0,1513.0
mean,1520.731659,1520.731659,1532.27495,1507.842036,1520.805684,4555045000.0
std,703.452562,703.452562,706.392707,699.681544,703.381929,1373284000.0
min,676.0,676.0,695.0,666.0,679.0,1025000000.0
25%,1123.0,1123.0,1139.0,1116.0,1125.0,3676890000.0
50%,1313.0,1313.0,1321.0,1305.0,1314.0,4258140000.0
75%,1450.0,1450.0,1461.0,1439.0,1451.0,5250660000.0
max,3386.0,3386.0,3393.0,3378.0,3380.0,11456230000.0


In [86]:
# 'tips' is one of seaborn's example datasets, loaded by name via load_dataset.
# NOTE(review): load_dataset may need network access on first use — confirm for offline runs.
tips = sns.load_dataset('tips')
# Preview the first rows.
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [87]:
# Add a 'tip_pct' column immediately after 'tip'. Despite the name, the value is
# the tip as a *fraction* of the total bill (e.g. 0.06), rounded to 2 decimals.
#
# DataFrame.insert mutates `tips` in place and raises ValueError when the column
# already exists, so the guard keeps this cell safe to re-run.
if 'tip_pct' not in tips.columns:
    tips.insert(
        loc=tips.columns.get_loc('tip') + 1,                 # position: right after 'tip'
        column='tip_pct',                                    # new column name
        value=(tips['tip'] / tips['total_bill']).round(2),   # computed values
    )

tips.head()


Unnamed: 0,total_bill,tip,tip_pct,sex,smoker,day,time,size
0,16.99,1.01,0.06,Female,No,Sun,Dinner,2
1,10.34,1.66,0.16,Male,No,Sun,Dinner,3
2,21.01,3.5,0.17,Male,No,Sun,Dinner,3
3,23.68,3.31,0.14,Male,No,Sun,Dinner,2
4,24.59,3.61,0.15,Female,No,Sun,Dinner,4


In [88]:
# Remove the derived 'tip_pct' column.
# inplace=True applies the drop directly to this DataFrame instead of returning
# a modified copy; the non-inplace equivalent is:
#     tips = tips.drop('tip_pct', axis='columns')
# errors='ignore' keeps the cell re-runnable: without it, a second run raises
# KeyError because the column has already been removed.
tips.drop('tip_pct', axis='columns', inplace=True, errors='ignore')
tips.columns

Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')

In [89]:
# Load the 'flights' dataset and replace its 'year' column with datetimes.
# Parsing with format='%Y' supplies only the year, so each value lands on
# January 1st of that year.
flights = (
    sns.load_dataset('flights')
    .assign(year=lambda frame: pd.to_datetime(frame['year'], format='%Y'))
)
flights.tail()

Unnamed: 0,year,month,passengers
139,1960-01-01,Aug,606
140,1960-01-01,Sep,508
141,1960-01-01,Oct,461
142,1960-01-01,Nov,390
143,1960-01-01,Dec,432


In [90]:
# Total passengers per year, ordered from the busiest year downwards.
# as_index=False keeps 'year' as a regular column rather than the index.
passengers_year = (
    flights
    .groupby('year', as_index=False)['passengers']
    .sum()
    .sort_values(by='passengers', ascending=False)
)
passengers_year.head()


Unnamed: 0,year,passengers
11,1960-01-01,5714
10,1959-01-01,5140
9,1958-01-01,4572
8,1957-01-01,4421
7,1956-01-01,3939
