# A. Code for Statistical Measures

In [239]:
import numpy as np
class Statistics:
    """Hand-written descriptive statistics for a one-dimensional sample.

    The observations are stored once as a NumPy array in ``self.data`` and
    every method derives its result from that array.  Most measures are
    implemented with explicit Python-level iteration rather than NumPy
    reductions.
    """

    def __init__(self, data):
        """Store the observations to analyse.

        Args:
            data: The numeric observations themselves (list, tuple, NumPy
                array, pandas Series, ...).  The value is converted with
                ``np.array``; pass the values, not a file path or URL.
        """
        self.data = np.array(data)

    def mean(self):
        """Return the arithmetic mean: sum of observations / number of observations.

        Raises:
            ZeroDivisionError: If the data is empty.
        """
        # Built-in sum() adds the elements one by one, in order.
        return sum(self.data) / len(self.data)

    def geometric_mean(self):
        """Return the geometric mean (the n-th root of the product of the values).

        The result is computed as ``exp(mean(log(x)))`` instead of multiplying
        all values together: the running product overflows fixed-width
        integers (and can overflow/underflow floats) for even moderately
        sized samples, which silently produced a wrong result.

        Returns:
            The geometric mean, or ``0.0`` when any observation is zero.

        Raises:
            ZeroDivisionError: If the data is empty.
            ValueError: If any observation is negative (the geometric mean is
                not defined for negative values).
        """
        inverse_count = 1 / len(self.data)
        values = self.data.astype(float)
        if np.any(values < 0):
            raise ValueError("geometric mean requires non-negative data")
        if np.any(values == 0):
            # A single zero factor makes the whole product zero.
            return 0.0
        return np.exp(np.log(values).sum() * inverse_count)

    def harmonic_mean(self):
        """Return the harmonic mean: n divided by the sum of reciprocals.

        Raises:
            ZeroDivisionError: If the data is empty.
        """
        reciprocal_sum = sum(1 / value for value in self.data)
        return len(self.data) / reciprocal_sum

    def mode(self):
        """Return the most frequent value in the data.

        Returns:
            The value with the highest frequency.  When several values share
            the highest frequency, the one that appears first in the data is
            returned.  Returns ``None`` when the data is empty.
        """
        freq = {}
        for value in self.data:
            freq[value] = freq.get(value, 0) + 1
        if not freq:
            return None
        # Pick the winner only after *all* observations are counted; doing it
        # while counting misses increments of already-seen values.
        return max(freq, key=freq.get)

    def median(self):
        """Return the median of the data.

        The data is sorted; for an odd number of elements the middle element
        is returned, for an even number the average of the two middle
        elements.
        """
        sorted_data = np.sort(self.data)
        count = len(sorted_data)
        mid = count // 2
        if count % 2 != 0:
            return sorted_data[mid]
        return (sorted_data[mid] + sorted_data[mid - 1]) / 2

    def describe(self):
        """Return a summary dictionary of the data.

        Returns:
            dict: Keys ``"mean"``, ``"median"``, ``"mode"``, ``"std_dev"``,
            ``"min"`` and ``"max"``, each holding the result of the
            corresponding method of this class.
        """
        return {
            "mean": self.mean(),
            "median": self.median(),
            "mode": self.mode(),
            "std_dev": self.std_dev(),
            "min": self.min_1(),
            "max": self.max_1(),
        }

    def variance(self):
        """Return the population variance.

        This is the average of the squared differences between each value and
        the mean, dividing by ``n`` (not ``n - 1``).
        """
        mean = self.mean()
        squared_diffs = sum((value - mean) ** 2 for value in self.data)
        return squared_diffs / len(self.data)

    def std_dev(self):
        """Return the population standard deviation (square root of the variance)."""
        return self.variance() ** (1 / 2)

    def min_1(self):
        """Return the smallest value in the data.

        Raises:
            IndexError: If the data is empty.
        """
        smallest = self.data[0]
        for value in self.data:
            if value < smallest:
                smallest = value
        return smallest

    def max_1(self):
        """Return the largest value in the data.

        Raises:
            IndexError: If the data is empty.
        """
        largest = self.data[0]
        for value in self.data:
            if value > largest:
                largest = value
        return largest

    def range_1(self):
        """Return the range: ``max_1() - min_1()``."""
        return self.max_1() - self.min_1()

    def quantile(self, quantile):
        """Return the observation located at the given quantile.

        The data is sorted and the element at position
        ``int(quantile * n)`` is returned (no interpolation).  The position
        is clamped to the last element so that ``quantile=1.0`` yields the
        maximum instead of indexing past the end.

        Args:
            quantile: Fraction in the closed interval ``[0, 1]``.

        Returns:
            The element of the sorted data at the computed position.

        Raises:
            ValueError: If ``quantile`` is outside ``[0, 1]``.
            IndexError: If the data is empty.
        """
        if not 0 <= quantile <= 1:
            raise ValueError("quantile must be between 0 and 1 inclusive")
        sorted_data = np.sort(self.data)
        count = len(sorted_data)
        index = min(int(quantile * count), count - 1)
        return sorted_data[index]

    def iqr(self):
        """Return the interquartile range: ``quantile(0.75) - quantile(0.25)``."""
        q1 = self.quantile(0.25)
        q3 = self.quantile(0.75)
        return q3 - q1

    def coeff_of_range(self):
        """Return the range divided by the mean.

        NOTE(review): many textbooks define the coefficient of range as
        ``(max - min) / (max + min)``; this method divides by the mean
        instead -- confirm which definition is intended.
        """
        return self.range_1() / self.mean()

    def coeff_of_variation(self):
        """Return the standard deviation divided by the mean, as a plain ratio."""
        return self.std_dev() / self.mean()

    def coeff_of_std_dev(self):
        """Return the standard deviation divided by the mean, times 100 (a percentage)."""
        return self.std_dev() / self.mean() * 100

    def coeff_of_quartile_dev(self):
        """Return the coefficient of quartile deviation: ``(Q3 - Q1) / (Q3 + Q1)``."""
        q1 = self.quantile(0.25)
        q3 = self.quantile(0.75)
        iqr = q3 - q1
        return iqr / (q3 + q1)

    def mean_dev(self):
        """Return the mean absolute deviation of the data about its mean."""
        mean = self.mean()
        total_dev = sum(abs(value - mean) for value in self.data)
        return total_dev / len(self.data)

    def coeff_of_mean_dev(self):
        """Return the mean absolute deviation divided by the mean."""
        return self.mean_dev() / self.mean()

## Applying Custom Methods on the SepalLength Column

In [240]:
import pandas as pd

In [241]:
# Read the Iris CSV, keep only the sepal-length column, and wrap it in the
# Statistics helper; the trailing bare name displays the object in the notebook.
column = 'SepalLengthCm'
data = pd.read_csv(r"C:\Users\Bhavani\Downloads\Iris - Iris.csv")[column]
x = Statistics(data)
x

<__main__.Statistics at 0x1e7735d8690>

In [242]:
# Table-driven report: (label, method name, positional arguments).
# Each method is looked up and called right before its line is printed, so
# the output order and evaluation order match a plain sequence of prints.
report = (
    ("Arithmetic Mean:", "mean", ()),
    ("Geometric Mean:", "geometric_mean", ()),
    ("Harmonic Mean:", "harmonic_mean", ()),
    ("Mode:", "mode", ()),
    ("Median:", "median", ()),
    ("Describe:", "describe", ()),
    ("Variance:", "variance", ()),
    ("Standard Deviation:", "std_dev", ()),
    ("Max Value:", "max_1", ()),
    ("Min Value:", "min_1", ()),
    ("Range:", "range_1", ()),
    ("IQR:", "iqr", ()),
    ("Quartile:Q1", "quantile", (0.25,)),
    ("Quartile:Q2", "quantile", (0.50,)),
    ("Quartile:Q3", "quantile", (0.75,)),
    ("Coefficient of Range:", "coeff_of_range", ()),
    ("Coefficient of Variation:", "coeff_of_variation", ()),
    ("Coefficient of Standard Deviation:", "coeff_of_std_dev", ()),
    ("Coefficient of Quartile Deviation:", "coeff_of_quartile_dev", ()),
    ("Coefficient of Mean Deviation:", "coeff_of_mean_dev", ()),
)
for label, method, args in report:
    print(label, getattr(x, method)(*args))

Arithmetic Mean: 5.843333333333335
Geometric Mean: 5.785720390427729
Harmonic Mean: 5.728905057850834
Mode: 5.0
Median: 5.8
Describe: {'mean': 5.843333333333335, 'median': 5.8, 'mode': 5.0, 'std_dev': 0.8253012917851409, 'min': 4.3, 'max': 7.9}
Variance: 0.6811222222222222
Standard Deviation: 0.8253012917851409
Max Value: 7.9
Min Value: 4.3
Range: 3.6000000000000005
IQR: 1.3000000000000007
Quartile:Q1 5.1
Quartile:Q2 5.8
Quartile:Q3 6.4
Coefficient of Range: 0.6160867084997147
Coefficient of Variation: 0.1412380989934639
Coefficient of Standard Deviation: 14.12380989934639
Coefficient of Quartile Deviation: 0.11304347826086962
Coefficient of Mean Deviation: 0.11766495531469869


# Applying Custom methods on SepalWidth Column

In [243]:
# Read the Iris CSV, keep only the sepal-width column, and wrap it in the
# Statistics helper; the trailing bare name displays the object in the notebook.
column = 'SepalWidthCm'
data = pd.read_csv(r"C:\Users\Bhavani\Downloads\Iris - Iris.csv")[column]
x = Statistics(data)
x

<__main__.Statistics at 0x1e7741f1d10>

In [244]:
# Table-driven report: (label, method name, positional arguments).
# Each method is looked up and called right before its line is printed, so
# the output order and evaluation order match a plain sequence of prints.
report = (
    ("Arithmetic Mean:", "mean", ()),
    ("Geometric Mean:", "geometric_mean", ()),
    ("Harmonic Mean:", "harmonic_mean", ()),
    ("Mode:", "mode", ()),
    ("Median:", "median", ()),
    ("Describe:", "describe", ()),
    ("Variance:", "variance", ()),
    ("Standard Deviation:", "std_dev", ()),
    ("Max Value:", "max_1", ()),
    ("Min Value:", "min_1", ()),
    ("Range:", "range_1", ()),
    ("IQR:", "iqr", ()),
    ("Quartile:Q1", "quantile", (0.25,)),
    ("Quartile:Q2", "quantile", (0.50,)),
    ("Quartile:Q3", "quantile", (0.75,)),
    ("Coefficient of Range:", "coeff_of_range", ()),
    ("Coefficient of Variation:", "coeff_of_variation", ()),
    ("Coefficient of Standard Deviation:", "coeff_of_std_dev", ()),
    ("Coefficient of Quartile Deviation:", "coeff_of_quartile_dev", ()),
    ("Coefficient of Mean Deviation:", "coeff_of_mean_dev", ()),
)
for label, method, args in report:
    print(label, getattr(x, method)(*args))

Arithmetic Mean: 3.0540000000000007
Geometric Mean: 3.0235822036025914
Harmonic Mean: 2.9931367940540596
Mode: 3.0
Median: 3.0
Describe: {'mean': 3.0540000000000007, 'median': 3.0, 'mode': 3.0, 'std_dev': 0.4321465800705435, 'min': 2.0, 'max': 4.4}
Variance: 0.1867506666666667
Standard Deviation: 0.4321465800705435
Max Value: 4.4
Min Value: 2.0
Range: 2.4000000000000004
IQR: 0.5
Quartile:Q1 2.8
Quartile:Q2 3.0
Quartile:Q3 3.3
Coefficient of Range: 0.7858546168958742
Coefficient of Variation: 0.14150182713508297
Coefficient of Standard Deviation: 14.150182713508297
Coefficient of Quartile Deviation: 0.0819672131147541
Coefficient of Mean Deviation: 0.10906788910718188


# Applying Custom methods on PetalWidth Column

In [245]:
# Read the Iris CSV, keep only the petal-width column, and wrap it in the
# Statistics helper; the trailing bare name displays the object in the notebook.
column = 'PetalWidthCm'
data = pd.read_csv(r"C:\Users\Bhavani\Downloads\Iris - Iris.csv")[column]
x = Statistics(data)
x

<__main__.Statistics at 0x1e77431ac50>

In [246]:
# Table-driven report: (label, method name, positional arguments).
# Each method is looked up and called right before its line is printed, so
# the output order and evaluation order match a plain sequence of prints.
report = (
    ("Arithmetic Mean:", "mean", ()),
    ("Geometric Mean:", "geometric_mean", ()),
    ("Harmonic Mean:", "harmonic_mean", ()),
    ("Mode:", "mode", ()),
    ("Median:", "median", ()),
    ("Describe:", "describe", ()),
    ("Variance:", "variance", ()),
    ("Standard Deviation:", "std_dev", ()),
    ("Max Value:", "max_1", ()),
    ("Min Value:", "min_1", ()),
    ("Range:", "range_1", ()),
    ("IQR:", "iqr", ()),
    ("Quartile:Q1", "quantile", (0.25,)),
    ("Quartile:Q2", "quantile", (0.50,)),
    ("Quartile:Q3", "quantile", (0.75,)),
    ("Coefficient of Range:", "coeff_of_range", ()),
    ("Coefficient of Variation:", "coeff_of_variation", ()),
    ("Coefficient of Standard Deviation:", "coeff_of_std_dev", ()),
    ("Coefficient of Quartile Deviation:", "coeff_of_quartile_dev", ()),
    ("Coefficient of Mean Deviation:", "coeff_of_mean_dev", ()),
)
for label, method, args in report:
    print(label, getattr(x, method)(*args))

Arithmetic Mean: 1.1986666666666672
Geometric Mean: 0.8378270050250772
Harmonic Mean: 0.48664645154044783
Mode: 0.2
Median: 1.3
Describe: {'mean': 1.1986666666666672, 'median': 1.3, 'mode': 0.2, 'std_dev': 0.760612618588172, 'min': 0.1, 'max': 2.5}
Variance: 0.5785315555555559
Standard Deviation: 0.760612618588172
Max Value: 2.5
Min Value: 0.1
Range: 2.4
IQR: 1.5
Quartile:Q1 0.3
Quartile:Q2 1.3
Quartile:Q3 1.8
Coefficient of Range: 2.0022246941045596
Coefficient of Variation: 0.6345489031603212
Coefficient of Standard Deviation: 63.45489031603212
Coefficient of Quartile Deviation: 0.7142857142857143
Coefficient of Mean Deviation: 0.5497219132369291


# Applying Custom methods on PetalLength Column

In [247]:
# Load the Iris CSV; as the cell's last expression, the resulting DataFrame is
# displayed by the notebook (nothing is assigned).
pd.read_csv(r"C:\Users\Bhavani\Downloads\Iris - Iris.csv")

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...,...
145,146,6.7,3.0,5.2,2.3,Iris-virginica
146,147,6.3,2.5,5.0,1.9,Iris-virginica
147,148,6.5,3.0,5.2,2.0,Iris-virginica
148,149,6.2,3.4,5.4,2.3,Iris-virginica


In [248]:
# Read the Iris CSV, keep only the petal-length column, and wrap it in the
# Statistics helper; the trailing bare name displays the object in the notebook.
column = 'PetalLengthCm'
data = pd.read_csv(r"C:\Users\Bhavani\Downloads\Iris - Iris.csv")[column]
x = Statistics(data)
x

<__main__.Statistics at 0x1e774287e10>

In [249]:
# Table-driven report: (label, method name, positional arguments).
# Each method is looked up and called right before its line is printed, so
# the output order and evaluation order match a plain sequence of prints.
report = (
    ("Arithmetic Mean:", "mean", ()),
    ("Geometric Mean:", "geometric_mean", ()),
    ("Harmonic Mean:", "harmonic_mean", ()),
    ("Mode:", "mode", ()),
    ("Median:", "median", ()),
    ("Describe:", "describe", ()),
    ("Variance:", "variance", ()),
    ("Standard Deviation:", "std_dev", ()),
    ("Max Value:", "max_1", ()),
    ("Min Value:", "min_1", ()),
    ("Range:", "range_1", ()),
    ("IQR:", "iqr", ()),
    ("Quartile:Q1", "quantile", (0.25,)),
    ("Quartile:Q2", "quantile", (0.50,)),
    ("Quartile:Q3", "quantile", (0.75,)),
    ("Coefficient of Range:", "coeff_of_range", ()),
    ("Coefficient of Variation:", "coeff_of_variation", ()),
    ("Coefficient of Standard Deviation:", "coeff_of_std_dev", ()),
    ("Coefficient of Quartile Deviation:", "coeff_of_quartile_dev", ()),
    ("Coefficient of Mean Deviation:", "coeff_of_mean_dev", ()),
)
for label, method, args in report:
    print(label, getattr(x, method)(*args))

Arithmetic Mean: 3.7586666666666693
Geometric Mean: 3.2397566359576
Harmonic Mean: 2.6964718010995794
Mode: 1.5
Median: 4.35
Describe: {'mean': 3.7586666666666693, 'median': 4.35, 'mode': 1.5, 'std_dev': 1.7585291834055201, 'min': 1.0, 'max': 6.9}
Variance: 3.0924248888888854
Standard Deviation: 1.7585291834055201
Max Value: 6.9
Min Value: 1.0
Range: 5.9
IQR: 3.4999999999999996
Quartile:Q1 1.6
Quartile:Q2 4.4
Quartile:Q3 5.1
Coefficient of Range: 1.5697055693508326
Coefficient of Variation: 0.46785983950129095
Coefficient of Standard Deviation: 46.78598395012909
Coefficient of Quartile Deviation: 0.5223880597014925
Coefficient of Mean Deviation: 0.415551614047534


## Trying Another Dataset

In [250]:
# Load the sales CSV into x. This rebinds x, which held a Statistics instance
# in the earlier cells, to a pandas DataFrame.
x=pd.read_csv(r"C:\Users\Bhavani\Downloads\Sales_Data_-_Sales_Data.csv")

In [251]:
# Bare expression: the notebook displays the current value of x.
x

Unnamed: 0,Region,Country,Item_Type,Sales_Channel,Order_Priority,Order_Date,Order_ID,Ship_Date,Units_Sold,Unit_SellingPrice,Unit_MakingCost,Total_Revenue,Total_Cost,Total_Profit
0,Central America and the Caribbean,Antigua and Barbuda,Baby Food,Online,M,12/20/2013,957081544.0,01-11-2014,552,255.28,159.42,140914.56,87999.84,52914.72
1,Central America and the Caribbean,Panama,Snacks,Offline,C,07-05-2010,301644504.0,7/26/2010,2167,152.58,97.44,330640.86,211152.48,119488.38
2,Europe,Czech Republic,Beverages,Offline,C,09-12-2011,478051030.0,9/29/2011,4778,47.45,31.79,226716.10,151892.62,74823.48
3,Asia,North Korea,Cereal,Offline,L,5/13/2010,892599952.0,6/15/2010,9016,205.70,117.11,1854591.20,1055863.76,798727.44
4,Asia,Sri Lanka,Snacks,Offline,C,7/20/2015,571902596.0,7/27/2015,7542,152.58,97.44,1150758.36,734892.48,415865.88
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,Australia and Oceania,New Zealand,Household,Offline,C,04-03-2015,217984473.0,4/20/2015,5305,668.27,502.54,3545172.35,2665974.70,879197.65
4996,Middle East and North Africa,Azerbaijan,Clothes,Offline,L,8/17/2014,169748055.0,10-02-2014,1077,109.28,35.84,117694.56,38599.68,79094.88
4997,Asia,Myanmar,Baby Food,Offline,H,11/23/2016,,12-10-2016,5204,255.28,159.42,1328477.12,829621.68,498855.44
4998,Europe,Finland,Clothes,Online,L,4/22/2014,,05-11-2014,9410,109.28,35.84,1028324.80,337254.40,691070.40


In [252]:
# NOTE(review): this passes the file *path string* to Statistics, not the
# column values, so np.array() inside the constructor wraps the string itself
# rather than numeric data. A later cell rebinds `data` before any method is
# called on this object -- confirm whether this cell can be dropped.
data=Statistics(r"C:\Users\Bhavani\Downloads\Sales_Data_-_Sales_Data.csv")

In [253]:
# Name of the sales column of interest.
# NOTE(review): the next cell defines and uses column_to_analyze instead, so
# this variable looks unused in the visible cells -- confirm.
column='Units_Sold'

In [254]:
# Read the sales CSV, keep only the Units_Sold column, and wrap it in Stats;
# the trailing bare name displays the object in the notebook.
# NOTE(review): Stats is not defined in the visible part of this notebook
# (only Statistics is) -- presumably it comes from an earlier cell or session;
# confirm, or switch to Statistics together with the method names used in the
# report cell that follows.
column_to_analyze = 'Units_Sold'
data = Stats(pd.read_csv(r"C:\Users\Bhavani\Downloads\Sales_Data_-_Sales_Data.csv")[column_to_analyze])
data

<__main__.Stats at 0x1e774305690>

In [255]:
# Table-driven report: (label, method name, positional arguments).
# Each method is looked up and called right before its line is printed, so
# the output order and evaluation order match a plain sequence of prints.
# NOTE(review): gmean / hmean / max1 / min1 / range1 are not methods of the
# Statistics class shown in this notebook (it uses geometric_mean,
# harmonic_mean, max_1, min_1, range_1) -- presumably they belong to the
# Stats class that `data` was built from; confirm.
report = (
    ("Arithmetic Mean:", "mean", ()),
    ("Geometric Mean:", "gmean", ()),
    ("Harmonic Mean:", "hmean", ()),
    ("Mode:", "mode", ()),
    ("Median:", "median", ()),
    ("Describe:", "describe", ()),
    ("Variance:", "variance", ()),
    ("Standard Deviation:", "std_dev", ()),
    ("Max Value:", "max1", ()),
    ("Min Value:", "min1", ()),
    ("Range:", "range1", ()),
    ("IQR:", "iqr", ()),
    ("Quartile:Q1", "quantile", (0.25,)),
    ("Quartile:Q2", "quantile", (0.50,)),
    ("Quartile:Q3", "quantile", (0.75,)),
    ("Coefficient of Range:", "coeff_of_range", ()),
    ("Coefficient of Variation:", "coeff_of_variation", ()),
    ("Coefficient of Standard Deviation:", "coeff_of_std_dev", ()),
    ("Coefficient of Quartile Deviation:", "coeff_of_quartile_dev", ()),
    ("Coefficient of Mean Deviation:", "coeff_of_mean_dev", ()),
)
for label, method, args in report:
    print(label, getattr(data, method)(*args))

Arithmetic Mean: 5030.6982
Geometric Mean: 0.0
Harmonic Mean: 768.8050651565511


  total *= x


Mode: 2
Median: 5123.0
Describe: {'mean': 5030.6982, 'median': 5123.0, 'mode': 2, 'std_dev': 2914.2239606311673, 'min': 2, 'max': 9999}
Variance: 8492701.292716807
Standard Deviation: 2914.2239606311673
Max Value: 9999
Min Value: 2
Range: 9997
IQR: 5123
Quartile:Q1 2454
Quartile:Q2 5124
Quartile:Q3 7577
Coefficient of Range: 1.9871993116184152
Coefficient of Variation: 0.5792881712982042
Coefficient of Standard Deviation: 57.92881712982042
Coefficient of Quartile Deviation: 0.5107167779882364
Coefficient of Mean Deviation: 0.5035153161364354
