In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv('dataset.csv')

In [4]:
print(df.head())
df.info()

   OrderID        Date Region   CustomerName       Product  Quantity  \
0     1001  2023-01-15  North  Alice Johnson        Laptop       2.0   
1     1002  2023-01-16  South    Rahul Mehta  Mobile Phone       5.0   
2     1003  2023-01-17   East    Fatima Noor    Headphones      10.0   
3     1004  2023-01-18   West            NaN        Laptop       1.0   
4     1005  2023-01-19  North     Zoe Carter  Mobile Phone       3.0   

   UnitPrice  TotalSales PaymentMethod  
0      700.0      1400.0   Credit Card  
1      300.0      1500.0           UPI  
2       50.0       500.0    Debit Card  
3      720.0       720.0   Credit Card  
4        NaN         NaN           UPI  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   OrderID        20 non-null     int64  
 1   Date           20 non-null     object 
 2   Region         20 non-null     object 

In [5]:
print("\nMissing values before cleaning:")
print(df.isnull().sum())


Missing values before cleaning:
OrderID          0
Date             0
Region           0
CustomerName     2
Product          0
Quantity         1
UnitPrice        2
TotalSales       2
PaymentMethod    0
dtype: int64


In [7]:
numerical_cols = df.select_dtypes(include=np.number).columns
categorical_cols = df.select_dtypes(include='object').columns

for col in numerical_cols:
    if df[col].isnull().any():
        df[col].fillna(df[col].mean(), inplace=True)
        print(f"Filled missing values in numerical column '{col}' with its mean.")

for col in categorical_cols:
    if df[col].isnull().any():
        df[col].fillna(df[col].mode()[0], inplace=True)
        print(f"Filled missing values in categorical column '{col}' with its mode.")

print("\nMissing values after cleaning:")
print(df.isnull().sum())
print("\nDataset Information after cleaning:")


Missing values after cleaning:
OrderID          0
Date             0
Region           0
CustomerName     0
Product          0
Quantity         0
UnitPrice        0
TotalSales       0
PaymentMethod    0
dtype: int64

Dataset Information after cleaning:


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   OrderID        20 non-null     int64  
 1   Date           20 non-null     object 
 2   Region         20 non-null     object 
 3   CustomerName   20 non-null     object 
 4   Product        20 non-null     object 
 5   Quantity       20 non-null     float64
 6   UnitPrice      20 non-null     float64
 7   TotalSales     20 non-null     float64
 8   PaymentMethod  20 non-null     object 
dtypes: float64(3), int64(1), object(5)
memory usage: 1.5+ KB


In [14]:
if 'df' not in locals() or df is None:
    print("Error: DataFrame 'df' is not defined. Please load your data first.")
else:
    print("Available columns:", df.columns.tolist())
    
    sales_cols = [col for col in df.columns if 'sales' in col.lower()]
    if sales_cols:
        print(f"Found potential sales columns: {sales_cols}")
        sales_col = sales_cols[0]
        print(f"Using '{sales_col}' as the sales column")
    else:
        sales_col = None
        print("No sales-related columns found")
    
    if sales_col:
        df[sales_col] = pd.to_numeric(df[sales_col], errors='coerce')
        df[sales_col].fillna(df[sales_col].mean(), inplace=True)
        
        # 1. Sum of sales by region
        if 'Region' in df.columns:
            sales_by_region = df.groupby('Region')[sales_col].sum().sort_values(ascending=False)
            print("\n--- Sum of Sales by Region ---")
            print(sales_by_region)
        else:
            print("Warning: 'Region' column not found for 'Sum of sales by region'.")
        
        # 2. Average sales per product
        if 'Product' in df.columns:
            avg_sales_per_product = df.groupby('Product')[sales_col].mean().sort_values(ascending=False)
            print("\n--- Average Sales per Product ---")
            print(avg_sales_per_product)
            
            # 3. Highest & lowest selling products
            print("\n--- Highest Selling Products ---")
            print(avg_sales_per_product.head(5)) # Top 5 products by average sales
            
            print("\n--- Lowest Selling Products ---")
            print(avg_sales_per_product.tail(5)) # Bottom 5 products by average sales
        else:
            print("Warning: 'Product' column not found for product-related analyses.")
    else:
        print("Error: No suitable sales column found. Please check your dataset.")

Available columns: ['OrderID', 'Date', 'Region', 'CustomerName', 'Product', 'Quantity', 'UnitPrice', 'TotalSales', 'PaymentMethod']
Found potential sales columns: ['TotalSales']
Using 'TotalSales' as the sales column

--- Sum of Sales by Region ---
Region
North    4461.666667
South    3830.000000
West     3121.666667
East     2520.000000
Name: TotalSales, dtype: float64

--- Average Sales per Product ---
Product
Laptop          937.777778
Tablet          900.000000
Mobile Phone    857.333333
Monitor         468.333333
Headphones      415.000000
Keyboard        342.500000
Smart Watch     200.000000
Name: TotalSales, dtype: float64

--- Highest Selling Products ---
Product
Laptop          937.777778
Tablet          900.000000
Mobile Phone    857.333333
Monitor         468.333333
Headphones      415.000000
Name: TotalSales, dtype: float64

--- Lowest Selling Products ---
Product
Mobile Phone    857.333333
Monitor         468.333333
Headphones      415.000000
Keyboard        342.500000
Sma

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[sales_col].fillna(df[sales_col].mean(), inplace=True)


In [15]:
print("\n--- Numerical Field Statistics (using NumPy) ---")

numerical_df = df.select_dtypes(include=np.number)

for column in numerical_df.columns:
    if column == 'Sales': 
        mean_val = np.mean(numerical_df[column])
        median_val = np.median(numerical_df[column])
        std_dev_val = np.std(numerical_df[column])

        print(f"\n--- Statistics for '{column}' ---")
        print(f"Mean: {mean_val:.2f}")
        print(f"Median: {median_val:.2f}")
        print(f"Standard Deviation: {std_dev_val:.2f}")


--- Numerical Field Statistics (using NumPy) ---
