In [1]:
# Dependencies
import pandas as pd
from pathlib import Path

In [2]:
# Store the relative path to the car-purchases CSV as a pathlib.Path
# (a relative path is resolved against the current working directory when opened).
data_file = Path("Resources/car_purchases.csv")

In [3]:
# Use pandas to read the CSV file into a DataFrame.
data_file_df = pd.read_csv(data_file)


In [5]:
# Shorter working name for the loaded data.
# NOTE: this binds a second name to the same DataFrame object (no copy is made),
# so any column added through `df` is also visible through `data_file_df`.
df = data_file_df

In [7]:
# Show the DataFrame; for long frames the rendered output elides the middle rows.
df

Unnamed: 0,id,Full Name,Gender,Amount,Car
0,1,Minnnie Rean,male,15484.5,Jeep
1,2,Ursa Torricella,female,13443.3,Saturn
2,3,Joyann Pirolini,male,9095.6,Ram
3,4,Sharl Ridsdell,female,11871.6,Dodge
4,5,Laurence Jovasevic,male,13459.8,Chrysler
...,...,...,...,...,...
995,996,Almire Stickins,male,9795.0,Infiniti
996,997,Taite Pocknell,female,19485.4,BMW
997,998,Roselin Bartlomiejczyk,female,19330.8,Mazda
998,999,Amandi Wenban,female,16113.7,Lincoln


In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) for the Amount column.
df["Amount"].describe()

count     1000.000000
mean      9988.738100
std       5783.375372
min         15.300000
25%       5043.150000
50%       9899.500000
75%      15044.225000
max      19927.900000
Name: Amount, dtype: float64

In [9]:
# Print the index range, column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   id         1000 non-null   int64  
 1   Full Name  1000 non-null   object 
 2   Gender     1000 non-null   object 
 3   Amount     1000 non-null   float64
 4   Car        1000 non-null   object 
dtypes: float64(1), int64(1), object(3)
memory usage: 39.2+ KB


In [16]:
# Tally purchases per car make and keep the ten most frequent.
(
    df["Car"]
    .value_counts()
    .head(10)
)

Car
Ford             93
Chevrolet        68
Dodge            50
Toyota           47
BMW              44
Mercedes-Benz    40
GMC              40
Mazda            39
Volkswagen       36
Pontiac          34
Name: count, dtype: int64

In [17]:
# Bracket notation with one column label returns that column as a Series;
# preview its first five values.
df["Amount"].head(5)

0    15484.5
1    13443.3
2     9095.6
3    11871.6
4    13459.8
Name: Amount, dtype: float64

In [19]:
# Passing a list of labels selects several columns at once.
# Because .head() is applied before the assignment, condensed_df holds
# only the first five rows of those columns.
condensed_df = (
    df[["Amount", "Gender", "Car"]]
    .head()
)
condensed_df

Unnamed: 0,Amount,Gender,Car
0,15484.5,male,Jeep
1,13443.3,female,Saturn
2,9095.6,male,Ram
3,11871.6,female,Dodge
4,13459.8,male,Chrysler


In [20]:
# Confirm the shape and dtypes of the condensed frame.
condensed_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Amount  5 non-null      float64
 1   Gender  5 non-null      object 
 2   Car     5 non-null      object 
dtypes: float64(1), object(2)
memory usage: 248.0+ bytes


In [21]:
# Series.mean() returns the arithmetic mean of the Amount column as a single number.
average = data_file_df["Amount"].mean()
average

9988.738100000002

In [22]:
# Series.sum() adds up every entry in the Amount column.
total = data_file_df["Amount"].sum()
total

9988738.100000001

In [23]:
# Series.unique() returns each distinct value of the Car column exactly once,
# as a NumPy array in order of first appearance.
unique = data_file_df["Car"].unique()
unique

array(['Jeep', 'Saturn', 'Ram', 'Dodge', 'Chrysler', 'Cadillac',
       'Pontiac', 'Nissan', 'Lexus', 'Volkswagen', 'Suzuki', 'Kia',
       'Mercury', 'Audi', 'Bugatti', 'BMW', 'Mazda', 'GMC', 'Ford',
       'Mercedes-Benz', 'Land Rover', 'Chevrolet', 'Toyota', 'Honda',
       'Subaru', 'Oldsmobile', 'MINI', 'Lincoln', 'Mitsubishi', 'Isuzu',
       'Infiniti', 'Eagle', 'Saab', 'Buick', 'Volvo', 'Lotus', 'Maserati',
       'Jensen', 'Hyundai', 'Maybach', 'Corbin', 'Acura', 'Ferrari',
       'Plymouth', 'Studebaker', 'Jaguar', 'Rolls-Royce', 'Aston Martin',
       'Merkur', 'Citroën', 'Daewoo', 'Tesla', 'Porsche', 'Scion', 'Geo',
       'Hummer', 'Lamborghini', 'Fiat', 'Bentley', 'Peugeot', 'Austin',
       'Spyker'], dtype=object)

In [24]:
# Number of distinct car makes found above.
len(unique)

62

In [25]:
# Purchases per car make, most frequent first (all makes, not just the top ten).
df["Car"].value_counts()

Car
Ford         93
Chevrolet    68
Dodge        50
Toyota       47
BMW          44
             ..
Merkur        1
Corbin        1
Jensen        1
Bugatti       1
Spyker        1
Name: count, Length: 62, dtype: int64

In [26]:
# Series.value_counts() counts how many rows hold each distinct value of a column,
# sorted from most to least frequent.
count = data_file_df["Gender"].value_counts()
count

Gender
male          455
female        446
non-binary     99
Name: count, dtype: int64

In [28]:
# Arithmetic on a Series is element-wise, and the result can be stored as a new column.
# Here Amount is divided by 1000, rounded to three decimal places, and attached
# to df in place under the label "Amount in kUSD".
thousands_of_dollars = df["Amount"].div(1000).round(3)
df["Amount in kUSD"] = thousands_of_dollars

df.head()

Unnamed: 0,id,Full Name,Gender,Amount,Car,Amount in kUSD
0,1,Minnnie Rean,male,15484.5,Jeep,15.484
1,2,Ursa Torricella,female,13443.3,Saturn,13.443
2,3,Joyann Pirolini,male,9095.6,Ram,9.096
3,4,Sharl Ridsdell,female,11871.6,Dodge,11.872
4,5,Laurence Jovasevic,male,13459.8,Chrysler,13.46
