In [3]:
import numpy as np
import warnings 
# Install a global "ignore" filter so no warnings are shown for the rest of the session.
# NOTE(review): this may also hide NumPy RuntimeWarnings (e.g. a divide-by-zero while
# computing reciprocals), so numerically invalid inputs can pass unnoticed.
warnings.filterwarnings('ignore')

class Statistics:
    """Descriptive statistics for a one-dimensional collection of numbers.

    The observations are stored once as a NumPy array and every method derives
    its result from that array. Dispersion measures (variance, standard
    deviation, covariance) use the population form, i.e. they divide by ``n``.
    Methods that need at least one observation raise ``ValueError`` on empty data.
    """

    def __init__(self, data):
        """Store the observations.

        Parameters:
        data : array-like of numbers (list, tuple, NumPy array, pandas Series, ...).
        """
        self.data = np.array(data)

    def _require_data(self):
        """Raise ValueError when there are no observations to summarise."""
        if len(self.data) == 0:
            raise ValueError("Statistics requires at least one observation.")

    @staticmethod
    def _average_ranks(values):
        """Return 1-based ranks of ``values``; tied values share the mean of their ranks."""
        values = np.asarray(values)
        _, inverse, counts = np.unique(values, return_inverse=True, return_counts=True)
        # For each distinct value (in ascending order) the cumulative count is the
        # highest rank in its tie group; stepping back half the group width gives
        # the group's mean rank.
        highest_rank = np.cumsum(counts)
        mean_rank = highest_rank - (counts - 1) / 2.0
        return mean_rank[inverse]

    def mean(self):
        """Arithmetic mean: sum of the observations divided by their count.

        Raises:
        ValueError if there are no observations.
        """
        self._require_data()
        return self.data.sum() / len(self.data)

    def geometric_mean(self):
        """Geometric mean: the n-th root of the product of the n observations.

        The value is computed as ``exp(mean(log(x)))`` rather than by multiplying
        all observations, because the running product overflows to ``inf`` (or
        underflows to 0) on long series.

        Returns:
        The geometric mean; 0.0 if any observation is zero.

        Raises:
        ValueError if there are no observations or any observation is negative.
        """
        self._require_data()
        if np.any(self.data < 0):
            raise ValueError("Geometric mean is undefined for negative values.")
        if np.any(self.data == 0):
            # A single zero factor makes the whole product, and hence the root, zero.
            return 0.0
        return np.exp(np.log(self.data).sum() / len(self.data))

    def harmonic_mean(self):
        """Harmonic mean: the count divided by the sum of the reciprocals.

        Raises:
        ValueError if there are no observations.
        """
        self._require_data()
        reciprocal_sum = (1 / self.data).sum()
        return len(self.data) / reciprocal_sum

    def mode(self):
        """Most frequent value in the data.

        Returns:
        The value with the highest count. When several values share the highest
        count, the one that appears first in the data is returned. Returns None
        when the data is empty.
        """
        if len(self.data) == 0:
            return None
        freq = {}
        for val in self.data:
            freq[val] = freq.get(val, 0) + 1
        # Pick the winner only after *all* counts are final. dicts keep insertion
        # order and max() returns the first maximal key, so ties go to the value
        # seen first.
        return max(freq, key=freq.get)

    def median(self):
        """Middle value of the sorted data.

        With an odd count this is the middle element; with an even count it is
        the average of the two middle elements.

        Raises:
        ValueError if there are no observations.
        """
        self._require_data()
        sorted_data = np.sort(self.data)
        count = len(sorted_data)
        mid = count // 2
        if count % 2 != 0:
            return sorted_data[mid]
        return (sorted_data[mid] + sorted_data[mid - 1]) / 2

    def describe(self):
        """Return a dict summarising the data.

        Keys: "mean", "median", "mode", "std_dev", "min", "max"; each value is
        the result of the method of the same purpose on this instance.
        """
        return {"mean" : self.mean(),
                "median": self.median(),
                "mode": self.mode(),
                "std_dev": self.std_dev(),
                "min": self.min_1(),
                "max": self.max_1()
                }

    def variance(self):
        """Population variance: mean of the squared deviations from the mean.

        Raises:
        ValueError if there are no observations.
        """
        self._require_data()
        deviations = self.data - self.mean()
        return (deviations ** 2).sum() / len(self.data)

    def std_dev(self):
        """Population standard deviation: the square root of ``variance()``."""
        return self.variance() ** (1/2)

    def min_1(self):
        """Smallest observation.

        Raises:
        ValueError if there are no observations.
        """
        self._require_data()
        return self.data.min()

    def max_1(self):
        """Largest observation.

        Raises:
        ValueError if there are no observations.
        """
        self._require_data()
        return self.data.max()

    def range_1(self):
        """Difference between the largest and the smallest observation."""
        return self.max_1() - self.min_1()

    def quantile(self, quantile):
        """Value found at a given fraction of the way through the sorted data.

        No interpolation is performed: the element at position
        ``int(quantile * n)`` of the sorted data is returned.

        Parameters:
        quantile : fraction between 0 and 1 inclusive (e.g. 0.25 for Q1).

        Returns:
        The observation at that position; ``quantile == 1`` returns the maximum.

        Raises:
        ValueError if there are no observations or ``quantile`` is outside [0, 1].
        """
        self._require_data()
        if not 0 <= quantile <= 1:
            raise ValueError("quantile must be between 0 and 1 inclusive.")
        sorted_data = np.sort(self.data)
        last = len(sorted_data) - 1
        # quantile == 1 would index one past the end, so clamp to the last element.
        index = min(int(quantile * len(sorted_data)), last)
        return sorted_data[index]

    def iqr(self):
        """Interquartile range: ``quantile(0.75) - quantile(0.25)``."""
        q1 = self.quantile(0.25)
        q3 = self.quantile(0.75)
        return q3 - q1

    def coeff_of_range(self):
        """Range divided by the arithmetic mean (``range_1() / mean()``)."""
        return self.range_1() / self.mean()

    def coeff_of_variation(self):
        """Standard deviation divided by the mean, returned as a plain ratio."""
        return self.std_dev() / self.mean()

    def coeff_of_std_dev(self):
        """Standard deviation divided by the mean, multiplied by 100 (a percentage)."""
        return self.std_dev() / self.mean() * 100

    def coeff_of_quartile_dev(self):
        """Coefficient of quartile deviation: ``(Q3 - Q1) / (Q3 + Q1)``.

        Q1 and Q3 are ``quantile(0.25)`` and ``quantile(0.75)``.
        """
        q1 = self.quantile(0.25)
        q3 = self.quantile(0.75)
        return (q3 - q1) / (q3 + q1)

    def mean_dev(self):
        """Mean absolute deviation of the observations from their arithmetic mean.

        Raises:
        ValueError if there are no observations.
        """
        self._require_data()
        return np.abs(self.data - self.mean()).sum() / len(self.data)

    def coeff_of_mean_dev(self):
        """Coefficient of mean deviation: ``mean_dev()`` divided by ``mean()``."""
        return self.mean_dev() / self.mean()

    def covariance(self, other_data):
        """Population covariance between the stored data and ``other_data``.

        Parameters:
        other_data : array-like of numbers with the same length as the stored data.

        Returns:
        The mean of the products of paired deviations from each series' mean.

        Raises:
        ValueError if the lengths differ or there are no observations.
        """
        other = np.asarray(other_data)
        if len(self.data) != len(other):
            raise ValueError("Datasets must have the same length for covariance calculation.")
        self._require_data()
        paired = (self.data - self.mean()) * (other - other.mean())
        return np.sum(paired) / len(self.data)

    def pearsons_correlation(self, other_data):
        """Pearson's correlation coefficient between the stored data and ``other_data``.

        Computed as the population covariance divided by the product of the two
        population standard deviations.

        Parameters:
        other_data : array-like of numbers with the same length as the stored data.

        Raises:
        ValueError if the lengths differ or there are no observations.
        """
        other = np.asarray(other_data)
        covariance_value = self.covariance(other)
        std_dev_self = self.std_dev()
        std_dev_other = np.std(other)
        return covariance_value / (std_dev_self * std_dev_other)

    def spearmans_correlation(self, other_data):
        """Spearman's rank correlation between the stored data and ``other_data``.

        Each series is replaced by its ranks (ties receive the average of the
        ranks they span) and Pearson's correlation of the two rank series is
        returned.

        Parameters:
        other_data : array-like of numbers with the same length as the stored data.

        Raises:
        ValueError if the lengths differ or there are no observations.
        """
        other = np.asarray(other_data)
        if len(self.data) != len(other):
            raise ValueError("Datasets must have the same length for correlation calculation.")
        self._require_data()
        # Ranks, not argsort: argsort yields the positions that would sort the
        # data, which is the inverse permutation of the ranks.
        rank_self = self._average_ranks(self.data)
        rank_other = self._average_ranks(other)
        return np.corrcoef(rank_self, rank_other)[0, 1]

    def rank_correlation(self, other_data):
        """Rank correlation coefficient; an alias for ``spearmans_correlation``."""
        return self.spearmans_correlation(other_data)

## Using the class and methods above, we calculate descriptive statistics for the SepalLengthCm column of the Iris data

In [47]:
import pandas as pd 

In [48]:
# Load the Iris CSV and wrap the sepal-length measurements in a Statistics object.
# NOTE(review): the path is absolute and machine-specific, so this cell only runs
# where that exact file exists.
column = 'SepalLengthCm'  # name of the column analysed in this section
iris_frame = pd.read_csv(r"C:\Users\Bhavani\Downloads\Iris - Iris.csv")
data = iris_frame[column]  # pandas Series holding only that column
x = Statistics(data)
x

<__main__.Statistics at 0x27c6faef750>

In [49]:
# Print every descriptive statistic of `x`, one "label value" line per measure.
# The table stores callables rather than results so that each value is computed
# immediately before it is printed, in the order listed.
summary_rows = (
    ("Arithmetic Mean:", x.mean),
    ("Geometric Mean:", x.geometric_mean),
    ("Harmonic Mean:", x.harmonic_mean),
    ("Mode:", x.mode),
    ("Median:", x.median),
    ("Describe:", x.describe),
    ("Variance:", x.variance),
    ("Standard Deviation:", x.std_dev),
    ("Max Value:", x.max_1),
    ("Min Value:", x.min_1),
    ("Range:", x.range_1),
    ("IQR:", x.iqr),
    ("Quartile:Q1", lambda: x.quantile(0.25)),
    ("Quartile:Q2", lambda: x.quantile(0.50)),
    ("Quartile:Q3", lambda: x.quantile(0.75)),
    ("Coefficient of Range:", x.coeff_of_range),
    ("Coefficient of Variation:", x.coeff_of_variation),
    ("Coefficient of Standard Deviation:", x.coeff_of_std_dev),
    ("Coefficient of Quartile Deviation:", x.coeff_of_quartile_dev),
    ("Coefficient of Mean Deviation:", x.coeff_of_mean_dev),
)
for label, compute in summary_rows:
    print(label, compute())

Arithmetic Mean: 5.843333333333335
Geometric Mean: 5.785720390427729
Harmonic Mean: 5.728905057850834
Mode: 5.0
Median: 5.8
Describe: {'mean': 5.843333333333335, 'median': 5.8, 'mode': 5.0, 'std_dev': 0.8253012917851409, 'min': 4.3, 'max': 7.9}
Variance: 0.6811222222222222
Standard Deviation: 0.8253012917851409
Max Value: 7.9
Min Value: 4.3
Range: 3.6000000000000005
IQR: 1.3000000000000007
Quartile:Q1 5.1
Quartile:Q2 5.8
Quartile:Q3 6.4
Coefficient of Range: 0.6160867084997147
Coefficient of Variation: 0.1412380989934639
Coefficient of Standard Deviation: 14.12380989934639
Coefficient of Quartile Deviation: 0.11304347826086962
Coefficient of Mean Deviation: 0.11766495531469869


## Using the class and methods above, we calculate descriptive statistics for the SepalWidthCm column of the Iris data

In [50]:
# Read the full Iris table, then wrap its sepal-width column in a Statistics object.
# NOTE(review): the path is absolute and machine-specific.
iris_csv_path = r"C:\Users\Bhavani\Downloads\Iris - Iris.csv"
data = pd.read_csv(iris_csv_path)  # whole DataFrame; later cells index it by column name
column = data['SepalWidthCm']      # the Series analysed in this section
x = Statistics(column)
x

<__main__.Statistics at 0x27c6faca650>

In [52]:
# Print every descriptive statistic of `x`, one "label value" line per measure.
# Callables are stored so each value is computed right before it is printed.
summary_rows = (
    ("Arithmetic Mean:", x.mean),
    ("Geometric Mean:", x.geometric_mean),
    ("Harmonic Mean:", x.harmonic_mean),
    ("Mode:", x.mode),
    ("Median:", x.median),
    ("Describe:", x.describe),
    ("Variance:", x.variance),
    ("Standard Deviation:", x.std_dev),
    ("Max Value:", x.max_1),
    ("Min Value:", x.min_1),
    ("Range:", x.range_1),
    ("IQR:", x.iqr),
    ("Quartile:Q1", lambda: x.quantile(0.25)),
    ("Quartile:Q2", lambda: x.quantile(0.50)),
    ("Quartile:Q3", lambda: x.quantile(0.75)),
    ("Coefficient of Range:", x.coeff_of_range),
    ("Coefficient of Variation:", x.coeff_of_variation),
    ("Coefficient of Standard Deviation:", x.coeff_of_std_dev),
    ("Coefficient of Quartile Deviation:", x.coeff_of_quartile_dev),
    ("Coefficient of Mean Deviation:", x.coeff_of_mean_dev),
)
for label, compute in summary_rows:
    print(label, compute())

# Second series used for the pairwise measures below; `data` must still be the
# full DataFrame at this point.
# NOTE(review): `x` appears to wrap this same SepalWidthCm column (see the cell
# that builds it). If so, these are self-comparisons: the covariance equals the
# variance and both correlations are 1. Confirm whether a different column was meant.
other_column = 'SepalWidthCm'
other_data = data[other_column]

pairwise_rows = (
    ("Covariance for sepalwidth", x.covariance),
    ("Pearson's correlation for sepalwidth column", x.pearsons_correlation),
    ("Spearman's rank correlation for sepalwidth", x.spearmans_correlation),
)
for label, measure in pairwise_rows:
    print(label, measure(other_data))

Arithmetic Mean: 3.0540000000000007
Geometric Mean: 3.0235822036025914
Harmonic Mean: 2.9931367940540596
Mode: 3.0
Median: 3.0
Describe: {'mean': 3.0540000000000007, 'median': 3.0, 'mode': 3.0, 'std_dev': 0.4321465800705435, 'min': 2.0, 'max': 4.4}
Variance: 0.1867506666666667
Standard Deviation: 0.4321465800705435
Max Value: 4.4
Min Value: 2.0
Range: 2.4000000000000004
IQR: 0.5
Quartile:Q1 2.8
Quartile:Q2 3.0
Quartile:Q3 3.3
Coefficient of Range: 0.7858546168958742
Coefficient of Variation: 0.14150182713508297
Coefficient of Standard Deviation: 14.150182713508297
Coefficient of Quartile Deviation: 0.0819672131147541
Coefficient of Mean Deviation: 0.10906788910718188
Covariance for sepalwidth 0.1867506666666667
Pearson's correlation for sepalwidth column 1.0000000000000002
Spearman's rank correlation for sepalwidth 1.0


## Using the class and methods above, we calculate descriptive statistics for the PetalWidthCm column of the Iris data

In [36]:
# Read the full Iris table, then wrap its petal-width column in a Statistics object.
# NOTE(review): the path is absolute and machine-specific.
iris_csv_path = r"C:\Users\Bhavani\Downloads\Iris - Iris.csv"
data = pd.read_csv(iris_csv_path)  # whole DataFrame
column = data['PetalWidthCm']      # the Series analysed in this section
x = Statistics(column)
x

<__main__.Statistics at 0x27c6f4fce50>

In [37]:
# Print every descriptive statistic of `x`, one "label value" line per measure.
# Callables are stored so each value is computed right before it is printed.
summary_rows = (
    ("Arithmetic Mean:", x.mean),
    ("Geometric Mean:", x.geometric_mean),
    ("Harmonic Mean:", x.harmonic_mean),
    ("Mode:", x.mode),
    ("Median:", x.median),
    ("Describe:", x.describe),
    ("Variance:", x.variance),
    ("Standard Deviation:", x.std_dev),
    ("Max Value:", x.max_1),
    ("Min Value:", x.min_1),
    ("Range:", x.range_1),
    ("IQR:", x.iqr),
    ("Quartile:Q1", lambda: x.quantile(0.25)),
    ("Quartile:Q2", lambda: x.quantile(0.50)),
    ("Quartile:Q3", lambda: x.quantile(0.75)),
    ("Coefficient of Range:", x.coeff_of_range),
    ("Coefficient of Variation:", x.coeff_of_variation),
    ("Coefficient of Standard Deviation:", x.coeff_of_std_dev),
    ("Coefficient of Quartile Deviation:", x.coeff_of_quartile_dev),
    ("Coefficient of Mean Deviation:", x.coeff_of_mean_dev),
)
for label, compute in summary_rows:
    print(label, compute())


Arithmetic Mean: 1.1986666666666672
Geometric Mean: 0.8378270050250772
Harmonic Mean: 0.48664645154044783
Mode: 0.2
Median: 1.3
Describe: {'mean': 1.1986666666666672, 'median': 1.3, 'mode': 0.2, 'std_dev': 0.760612618588172, 'min': 0.1, 'max': 2.5}
Variance: 0.5785315555555559
Standard Deviation: 0.760612618588172
Max Value: 2.5
Min Value: 0.1
Range: 2.4
IQR: 1.5
Quartile:Q1 0.3
Quartile:Q2 1.3
Quartile:Q3 1.8
Coefficient of Range: 2.0022246941045596
Coefficient of Variation: 0.6345489031603212
Coefficient of Standard Deviation: 63.45489031603212
Coefficient of Quartile Deviation: 0.7142857142857143
Coefficient of Mean Deviation: 0.5497219132369291
