In [1]:
# Import pandas library for data handling
import pandas as pd

In [2]:
# Read the raw tours CSV into a DataFrame.
file_path = 'data/raw_tours.csv'  # Relative path: the CSV must exist at this location
df = pd.read_csv(filepath_or_buffer=file_path)

In [3]:
# =======================================
# Step 1 - Peek at the top of the dataset
# =======================================
print("🔍 Preview of the dataset:")
print(df.head(n=5))  # Five rows (the head() default) give a quick look at the layout

🔍 Preview of the dataset:
   Rank  Peak All Time Peak  Actual gross Actual gross(in 2022 dollars)  \
0     1     1             2  $780,000,000                  $780,000,000   
1     2     1          7[2]  $579,800,000                  $579,800,000   
2     3  1[4]          2[5]  $411,000,000                  $560,622,615   
3     4  2[7]         10[7]  $397,300,000                  $454,751,555   
4     5  2[4]           NaN  $345,675,146                  $402,844,849   

         Artist                   Tour title    Year(s)  Shows Average gross  \
0  Taylor Swift              The Eras Tour †  2023–2024     56   $13,928,571   
1       Beyoncé       Renaissance World Tour       2023     56   $10,353,571   
2       Madonna  Sticky & Sweet Tour ‡[4][a]  2008–2009     85    $4,835,294   
3          Pink  Beautiful Trauma World Tour  2018–2019    156    $2,546,795   
4  Taylor Swift      Reputation Stadium Tour       2018     53    $6,522,173   

  Ref.  
0  [1]  
1  [3]  
2  [6]  
3  [7]

In [4]:
# ----------------------------------------------------------
# 2. Check the structure of the DataFrame (column names, types)
# ----------------------------------------------------------
print("\n📄 DataFrame summary (types, non-null counts):")
# info() writes its report straight to stdout and returns None, so it is
# called bare: wrapping it in print() would add a stray "None" line after
# the report.
df.info()  # Useful for spotting columns with missing values


📄 DataFrame summary (types, non-null counts):
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 11 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   Rank                           20 non-null     int64 
 1   Peak                           9 non-null      object
 2   All Time Peak                  6 non-null      object
 3   Actual gross                   20 non-null     object
 4   Actual gross(in 2022 dollars)  20 non-null     object
 5   Artist                         20 non-null     object
 6   Tour title                     20 non-null     object
 7   Year(s)                        20 non-null     object
 8   Shows                          20 non-null     int64 
 9   Average gross                  20 non-null     object
 10  Ref.                           20 non-null     object
dtypes: int64(2), object(9)
memory usage: 1.8+ KB
None


In [5]:
# =========================================
# Step 3 - Count missing entries per column
# =========================================
print("\n❗ Missing values per column:")
print(df.isna().sum())  # isna() flags missing cells; summing the flags counts them per column


❗ Missing values per column:
Rank                              0
Peak                             11
All Time Peak                    14
Actual gross                      0
Actual gross(in 2022 dollars)     0
Artist                            0
Tour title                        0
Year(s)                           0
Shows                             0
Average gross                     0
Ref.                              0
dtype: int64


In [6]:
# =========================================
# Step 4 - Count fully duplicated rows
# =========================================
print("\n🔁 Number of duplicated rows:")
# With keep="first" (the default), the first occurrence of a row is not flagged;
# only its later repeats are counted.
print(df.duplicated(keep="first").sum())


🔁 Number of duplicated rows:
0


In [7]:
# ===============================================
# Step 5 - Summary statistics by column kind
# ===============================================

# Numeric columns: count, mean, std, min, quartiles, max.
print("\n📊 Descriptive statistics (numerical columns):")
print(df.describe())

# Object (text) columns: count, unique, top (most frequent value), freq.
print("\n📝 Descriptive stats (object/categorical columns):")
print(df.describe(include=["object"]))


📊 Descriptive statistics (numerical columns):
            Rank       Shows
count  20.000000   20.000000
mean   10.450000  110.000000
std     5.942488   66.507617
min     1.000000   41.000000
25%     5.750000   59.000000
50%    10.500000   87.000000
75%    15.250000  134.500000
max    20.000000  325.000000

📝 Descriptive stats (object/categorical columns):
       Peak All Time Peak  Actual gross Actual gross(in 2022 dollars)  \
count     9             6            20                            20   
unique    7             6            20                            20   
top       1             2  $780,000,000                  $780,000,000   
freq      2             1             1                             1   

              Artist       Tour title    Year(s) Average gross Ref.  
count             20               20         20            20   20  
unique             9               20         16            20   20  
top     Taylor Swift  The Eras Tour †  2013–2014   $13,928,571  [

In [8]:
# =========================================
# Step 6 - List the column labels
# =========================================
print("\n🧾 Column names:")
print(list(df.columns))  # Plain Python list of the column labels, in order


🧾 Column names:
['Rank', 'Peak', 'All Time Peak', 'Actual gross', 'Actual gross(in 2022 dollars)', 'Artist', 'Tour title', 'Year(s)', 'Shows', 'Average gross', 'Ref.']
