In [1]:
# Cell 1: Import libraries and setup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import requests

print("Libraries imported successfully!")

Libraries imported successfully!


In [5]:
# Cell 2: Download the meteorite data (run right after the imports in Cell 1)
def download_meteorite_data(url=None):
    """Fetch the meteorite landings CSV and store a copy under ``../data/raw``.

    Parameters
    ----------
    url : str, optional
        Where to read the CSV from; handed to ``pd.read_csv`` unchanged.
        When omitted, the GitHub-hosted copy this notebook was written
        against is used.

    Returns
    -------
    pandas.DataFrame
        The table that was read, or the frame returned by
        ``create_sample_data()`` if reading failed.

    Notes
    -----
    Only the read is best-effort: any exception it raises is printed and
    answered with the sample-data fallback. Errors while saving the
    downloaded table are not caught, so a successful download is never
    discarded in favour of simulated data.
    """
    file_path = '../data/raw/meteorite_landings.csv'

    # Create the target directory if it doesn't exist yet
    os.makedirs(os.path.dirname(file_path), exist_ok=True)

    if url is None:
        # Default source (alternative mirror of the dataset)
        url = "https://raw.githubusercontent.com/datasets/meteorite-landings/master/data/meteorite-landings.csv"

    print("Downloading meteorite data from alternative source...")
    try:
        df = pd.read_csv(url)
    except Exception as e:
        # Deliberately broad: whatever goes wrong while fetching or parsing,
        # the notebook continues with simulated data.
        print(f"❌ Download failed: {e}")
        print("Creating sample data instead...")
        return create_sample_data()

    # Saving happens outside the try block so that a write error surfaces
    # as itself rather than being reported as a failed download.
    df.to_csv(file_path, index=False)
    print(f"✅ Data successfully saved to: {file_path}")
    print(f"Dataset shape: {df.shape}")
    return df

def create_sample_data(file_path='../data/raw/meteorite_landings.csv', n_samples=5000, seed=42):
    """Build a simulated meteorite table, write it to CSV, and return it.

    Parameters
    ----------
    file_path : str, optional
        Destination CSV. Missing parent directories are created. The default
        is the same relative path ``download_meteorite_data`` saves to.
    n_samples : int, optional
        Number of rows to generate (default 5000).
    seed : int, optional
        Value passed to ``np.random.seed`` before any values are drawn
        (default 42).

    Returns
    -------
    pandas.DataFrame
        Columns ``name``, ``id``, ``nametype``, ``recclass``, ``mass (g)``,
        ``fall``, ``year``, ``reclat``, ``reclong``. After generation, 10% of
        rows have ``mass (g)`` blanked and 5% of rows have both coordinates
        blanked.

    Side effects
    ------------
    Reseeds NumPy's global random state, writes ``file_path``, and prints two
    status lines.
    """
    # Make the function usable on its own: don't rely on a caller having
    # created the output directory beforehand.
    out_dir = os.path.dirname(file_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Create realistic sample data. The draw order below is kept fixed so a
    # given seed always yields the same table.
    np.random.seed(seed)

    data = {
        'name': [f'Meteorite_{i:04d}' for i in range(n_samples)],
        'id': range(1000, 1000 + n_samples),
        'nametype': np.random.choice(['Valid', 'Relict'], n_samples, p=[0.95, 0.05]),
        'recclass': np.random.choice(['L5', 'H5', 'LL5', 'L6', 'H6', 'EH4', 'OC'], n_samples),
        'mass (g)': np.random.exponential(1000, n_samples),
        'fall': np.random.choice(['Fell', 'Found'], n_samples, p=[0.2, 0.8]),
        # Despite the column name, each value is a full calendar date picked
        # from the daily range 1950-01-01 .. 2020-12-31.
        'year': np.random.choice(pd.date_range('1950-01-01', '2020-12-31', freq='D'), n_samples),
        'reclat': np.random.uniform(-90, 90, n_samples),
        'reclong': np.random.uniform(-180, 180, n_samples),
    }

    df = pd.DataFrame(data)

    # Add realistic missing values: mass on 10% of rows, and latitude and
    # longitude together on a separately drawn 5% of rows.
    df.loc[df.sample(frac=0.1).index, 'mass (g)'] = np.nan
    df.loc[df.sample(frac=0.05).index, ['reclat', 'reclong']] = np.nan

    df.to_csv(file_path, index=False)
    print(f"📁 Sample dataset created at: {file_path}")
    print("⚠️  This is simulated data for demonstration purposes")

    return df

# Download the data. If the download raises, download_meteorite_data() returns
# simulated data from create_sample_data() instead, so check the printed
# messages to see which kind of table `df` holds.
df = download_meteorite_data()

Downloading meteorite data from alternative source...
❌ Download failed: HTTP Error 404: Not Found
Creating sample data instead...
📁 Sample dataset created at: ../data/raw/meteorite_landings.csv
⚠️  This is simulated data for demonstration purposes


In [6]:
# Quick orientation: dimensions, column names, then a peek at the first rows.
print(
    "Dataset loaded successfully!",
    f"Shape: {df.shape}",
    f"\nColumns: {df.columns.tolist()}",
    "\nFirst 5 rows:",
    sep="\n",
)
# Left as the final expression so the notebook renders it as a table.
df.head()

Dataset loaded successfully!
Shape: (5000, 9)

Columns: ['name', 'id', 'nametype', 'recclass', 'mass (g)', 'fall', 'year', 'reclat', 'reclong']

First 5 rows:


Unnamed: 0,name,id,nametype,recclass,mass (g),fall,year,reclat,reclong
0,Meteorite_0000,1000,Valid,LL5,141.268446,Found,2002-07-28,-84.398374,30.476075
1,Meteorite_0001,1001,Relict,OC,1140.17754,Found,1970-05-02,85.018923,-42.7572
2,Meteorite_0002,1002,Valid,LL5,1848.056754,Found,1987-10-02,30.984098,-144.394731
3,Meteorite_0003,1003,Valid,EH4,259.381765,Found,1988-05-08,-14.469656,-117.437687
4,Meteorite_0004,1004,Valid,L6,2013.11015,Fell,1980-05-02,-64.552441,64.682408


In [7]:
# Cell 5: Initial Data Exploration
# Size and layout of the frame; memory is reported in MB and includes the
# contents of object columns (deep=True).
print(
    "Dataset Information:",
    "====================",
    f"Shape: {df.shape}",
    f"Columns: {df.columns.tolist()}",
    f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB",
    sep="\n",
)

# Null count per column and that count as a share of all rows.
missing_values = df.isnull().sum()
missing_percent = (missing_values / len(df)) * 100
missing_info = pd.DataFrame({
    'Missing Count': missing_values,
    'Missing Percentage': missing_percent,
})

print("\nMissing Values:", "===============", sep="\n")
# Only columns that actually have gaps are listed.
print(missing_info.loc[missing_info['Missing Count'] > 0])

print("\nData Types:", "===========", sep="\n")
print(df.dtypes)

Dataset Information:
Shape: (5000, 9)
Columns: ['name', 'id', 'nametype', 'recclass', 'mass (g)', 'fall', 'year', 'reclat', 'reclong']
Memory usage: 1.25 MB

Missing Values:
          Missing Count  Missing Percentage
mass (g)            500                10.0
reclat              250                 5.0
reclong             250                 5.0

Data Types:
name                object
id                   int64
nametype            object
recclass            object
mass (g)           float64
fall                object
year        datetime64[ns]
reclat             float64
reclong            float64
dtype: object


In [8]:
# Cell 6: Handle Missing Values
# Work on a copy so the raw frame `df` is left untouched.
df_clean = df.copy()

print(
    "Before cleaning:",
    f"Total rows: {df_clean.shape[0]}",
    f"Missing values: {df_clean.isnull().sum().sum()}",
    sep="\n",
)

# Strategy: nothing is dropped or imputed in this cell. Rows lacking mass or
# coordinates stay in df_clean, so the "after" figures equal the "before" ones.
print(
    "\nMissing values strategy:",
    "- Rows with missing mass will be kept",
    "- Rows with missing coordinates will be kept",
    sep="\n",
)

print(
    f"\nAfter cleaning - Total rows: {df_clean.shape[0]}",
    f"Missing values remaining: {df_clean.isnull().sum().sum()}",
    sep="\n",
)

Before cleaning:
Total rows: 5000
Missing values: 1000

Missing values strategy:
- Rows with missing mass will be kept
- Rows with missing coordinates will be kept

After cleaning - Total rows: 5000
Missing values remaining: 1000


In [9]:
# Cell 7: Fix Data Types
print("Before fixing data types:")
print(df_clean.dtypes)

# Convert year to datetime. errors='coerce' replaces anything unparseable with
# NaT instead of raising, so record which rows were already empty beforehand;
# otherwise genuine gaps and failed conversions cannot be told apart.
# NOTE(review): if the real CSV stores 'year' as plain numbers (e.g. 1880)
# rather than date strings, to_datetime would read them as epoch offsets —
# confirm against the downloaded file.
year_was_missing = df_clean['year'].isna()
df_clean['year'] = pd.to_datetime(df_clean['year'], errors='coerce')

print("\nAfter fixing data types:")
print(df_clean.dtypes)

# Check for any conversion issues: a year is "invalid" only if a value was
# present and could not be parsed. Rows that had no year to begin with are
# reported separately. (On a re-run of this cell the column is already
# datetime, so earlier failures show up under "no year" instead.)
invalid_year_count = int((df_clean['year'].isna() & ~year_was_missing).sum())
print(f"\nRows with invalid years: {invalid_year_count}")
print(f"Rows with no year before conversion: {int(year_was_missing.sum())}")

Before fixing data types:
name                object
id                   int64
nametype            object
recclass            object
mass (g)           float64
fall                object
year        datetime64[ns]
reclat             float64
reclong            float64
dtype: object

After fixing data types:
name                object
id                   int64
nametype            object
recclass            object
mass (g)           float64
fall                object
year        datetime64[ns]
reclat             float64
reclong            float64
dtype: object

Rows with invalid years: 0


In [10]:
# Cell 8: Data Quality Checks
print("Data Quality Checks:", "====================", sep="\n")

# Rows whose values fall outside the physically sensible range. Comparisons
# against NaN are False, so rows with missing values are never flagged here.
invalid_mass = df_clean.loc[lambda d: d['mass (g)'].le(0)]
invalid_lat = df_clean.loc[lambda d: d['reclat'].lt(-90) | d['reclat'].gt(90)]
invalid_long = df_clean.loc[lambda d: d['reclong'].lt(-180) | d['reclong'].gt(180)]

print(
    f"Rows with mass <= 0: {invalid_mass.shape[0]}",
    f"Rows with invalid latitude: {invalid_lat.shape[0]}",
    f"Rows with invalid longitude: {invalid_long.shape[0]}",
    sep="\n",
)

# Distinct labels in the categorical columns (recclass only as a count).
print(
    f"\nUnique nametype values: {df_clean['nametype'].unique()}",
    f"Unique fall values: {df_clean['fall'].unique()}",
    f"Number of unique recclass: {df_clean['recclass'].nunique()}",
    sep="\n",
)

Data Quality Checks:
Rows with mass <= 0: 0
Rows with invalid latitude: 0
Rows with invalid longitude: 0

Unique nametype values: ['Valid' 'Relict']
Unique fall values: ['Found' 'Fell']
Number of unique recclass: 7


In [11]:
# Cell 9: Save Cleaned Data
cleaned_file_path = '../data/cleaned/meteorite_landings_cleaned.csv'

# Create the cleaned-data directory, derived from the file path itself so the
# directory and the file location cannot drift apart.
os.makedirs(os.path.dirname(cleaned_file_path), exist_ok=True)

# Save cleaned dataset (index omitted).
# NOTE(review): CSV is plain text, so the datetime 'year' column will likely
# need parse_dates=['year'] when this file is read back — confirm in the
# notebook that loads it.
df_clean.to_csv(cleaned_file_path, index=False)
print(f"✅ Cleaned data saved to: {cleaned_file_path}")
print(f"Cleaned dataset shape: {df_clean.shape}")

✅ Cleaned data saved to: ../data/cleaned/meteorite_landings_cleaned.csv
Cleaned dataset shape: (5000, 9)


In [12]:
# Cell 10: Basic Summary Statistics
print("Summary Statistics:", "==================", sep="\n")

# describe() skips NaN, so the count row shows how many values each column has.
print("\nNumerical Columns:")
print(df_clean.loc[:, ['mass (g)', 'reclat', 'reclong']].describe())

# Frequency tables for the categorical columns (top 10 classes only).
print("\nCategorical Columns:")
print("Most common meteorite classes:")
print(df_clean['recclass'].value_counts().head(10))

print("\nFall vs Found distribution:")
print(df_clean['fall'].value_counts())

# Span covered by the 'year' column.
print(
    "\nDate range:",
    f"Earliest: {df_clean['year'].min()}",
    f"Latest: {df_clean['year'].max()}",
    sep="\n",
)

Summary Statistics:

Numerical Columns:
          mass (g)       reclat      reclong
count  4500.000000  4750.000000  4750.000000
mean    996.574792     0.516348     2.230761
std     999.796264    52.434340   103.139797
min       0.185212   -89.963748  -179.992614
25%     296.221748   -44.754143   -85.848970
50%     692.402306    -0.271405     4.536708
75%    1377.487235    46.586909    90.617806
max    8259.734239    89.980391   179.990592

Categorical Columns:
Most common meteorite classes:
recclass
H5     765
L5     753
H6     724
OC     710
LL5    684
L6     683
EH4    681
Name: count, dtype: int64

Fall vs Found distribution:
fall
Found    4019
Fell      981
Name: count, dtype: int64

Date range:
Earliest: 1950-01-04 00:00:00
Latest: 2020-12-30 00:00:00
