<h1>For Iris.csv dataset</h1>

In [26]:
import numpy as np
import pandas as pd

In [27]:
data = pd.read_csv("/home/te/Desktop/31449_DSBDAL/Iris.csv") #read the Iris.csv file

In [28]:
data.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [29]:
data.shape

(150, 6)

In [30]:
data.dtypes

Id                 int64
SepalLengthCm    float64
SepalWidthCm     float64
PetalLengthCm    float64
PetalWidthCm     float64
Species           object
dtype: object

<h4> Check whether the dataset contains null values</h4>

In [31]:
data.isnull().sum()

Id               0
SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64

<h2 style="font-size: 24px;">Central Tendency Calculation with the help of in-built functions</h2>

In [32]:
from scipy.stats import mode
# Dictionary to store central tendency measures for each numeric column
central_tendency = {}

# Loop through each numeric column in the dataset
for column in data.select_dtypes(include=[float]).columns:
    column_data = data[column]
    mode_result = mode(column_data, nan_policy='omit')  # Handle NaN if any

    central_tendency[column] = {
        "Mean": column_data.mean(),
        "Median": column_data.median(),
        "Mode": mode_value,
        "Midrange": (column_data.min() + column_data.max()) / 2,
    }

central_tendency_df = pd.DataFrame(central_tendency).T
central_tendency_df.index.name = "Attribute"

print("Measures of Central Tendency:")
print(central_tendency_df)


Measures of Central Tendency:
                   Mean  Median  Mode  Midrange
Attribute                                      
SepalLengthCm  5.843333    5.80   0.2      6.10
SepalWidthCm   3.054000    3.00   0.2      3.20
PetalLengthCm  3.758667    4.35   0.2      3.95
PetalWidthCm   1.198667    1.30   0.2      1.30


<h2 style="font-size: 24px;">Central Tendency Calculation without the help of in-built functions</h2>

In [33]:
# Dictionary to store central tendency measures for each column
central_tendency = {}

# Loop through each float column in the dataset
for column in data.select_dtypes(include=[float]).columns:
    column_data = data[column]
    
    # Calculate mean, median, mode, and midrange without using built-in functions
    mean_value = sum(column_data) / len(column_data)
    
    sorted_data = sorted(column_data)
    mid = len(sorted_data) // 2
    
    median_value = (sorted_data[mid] if len(sorted_data) % 2 != 0 
                    else (sorted_data[mid - 1] + sorted_data[mid]) / 2)
    
    freq_dict = {}
    for value in column_data:
        freq_dict[value] = freq_dict.get(value, 0) + 1     
    mode_value = max(freq_dict, key=freq_dict.get)
    
    midrange_value = (min(column_data) + max(column_data)) / 2

    # Store the values in the dictionary
    central_tendency[column] = {
        "Mean": mean_value,
        "Median": median_value,
        "Mode": mode_value,
        "Midrange": midrange_value,
    }

# Convert to a DataFrame for better display
central_tendency_df = pd.DataFrame(central_tendency).T
central_tendency_df.index.name = "Attribute"

print("Measures of Central Tendency for Float Columns:")
print(central_tendency_df)


Measures of Central Tendency for Float Columns:
                   Mean  Median  Mode  Midrange
Attribute                                      
SepalLengthCm  5.843333    5.80   5.0      6.10
SepalWidthCm   3.054000    3.00   3.0      3.20
PetalLengthCm  3.758667    4.35   1.5      3.95
PetalWidthCm   1.198667    1.30   0.2      1.30


In [34]:
data['Species'].value_counts()

Species
Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
Name: count, dtype: int64

<h2>Measure of Dispersion Calculation</h2>

In [35]:
# Convert data into a DataFrame
df = pd.DataFrame(data)

# Selecting only numerical columns for calculation
numerical_data = df[['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']]

# Calculate range (difference between max and min), variance, standard deviation and quartiles
range_values = numerical_data.max() - numerical_data.min()
variance = numerical_data.var()
std_deviation = numerical_data.std()
quartiles = numerical_data.quantile([0.25, 0.5, 0.75])

# Display results
print(f"Range:\n{range_values}\n")
print(f"Variance:\n{variance}\n")
print(f"Standard Deviation:\n{std_deviation}\n")
print(f"Quartiles (25%, 50%, 75%):\n{quartiles}")


Range:
SepalLengthCm    3.6
SepalWidthCm     2.4
PetalLengthCm    5.9
PetalWidthCm     2.4
dtype: float64

Variance:
SepalLengthCm    0.685694
SepalWidthCm     0.188004
PetalLengthCm    3.113179
PetalWidthCm     0.582414
dtype: float64

Standard Deviation:
SepalLengthCm    0.828066
SepalWidthCm     0.433594
PetalLengthCm    1.764420
PetalWidthCm     0.763161
dtype: float64

Quartiles (25%, 50%, 75%):
      SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm
0.25            5.1           2.8           1.60           0.3
0.50            5.8           3.0           4.35           1.3
0.75            6.4           3.3           5.10           1.8


<h2>Statistics for a numeric variables grouped by a categorical variable(Species)</h2>


In [37]:

summary_statistics = data.groupby("Species").agg(
    {
        "SepalLengthCm": ["mean", "median", "min", "max", "std"],
        "SepalWidthCm": ["mean", "median", "min", "max", "std"],
        "PetalLengthCm": ["mean", "median", "min", "max", "std"],
        "PetalWidthCm": ["mean", "median", "min", "max", "std"],
    }
)
# Print nicely formatted summary statistics
print("\nSummary Statistics for Iris Dataset (Grouped by Species):")
print(summary_statistics)



Summary Statistics for Iris Dataset (Grouped by Species):
                SepalLengthCm                            SepalWidthCm         \
                         mean median  min  max       std         mean median   
Species                                                                        
Iris-setosa             5.006    5.0  4.3  5.8  0.352490        3.418    3.4   
Iris-versicolor         5.936    5.9  4.9  7.0  0.516171        2.770    2.8   
Iris-virginica          6.588    6.5  4.9  7.9  0.635880        2.974    3.0   

                                    PetalLengthCm                             \
                 min  max       std          mean median  min  max       std   
Species                                                                        
Iris-setosa      2.3  4.4  0.381024         1.464   1.50  1.0  1.9  0.173511   
Iris-versicolor  2.0  3.4  0.313798         4.260   4.35  3.0  5.1  0.469911   
Iris-virginica   2.2  3.8  0.322497         5.552   5.55  4.

<h1>For nba.csv dataset</h1>

In [38]:
data2 = pd.read_csv("/home/te/Desktop/31449_DSBDAL/nba.csv") #read from nba.csv

In [39]:
data2.head()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0


In [40]:
data2.shape

(458, 9)

In [41]:
data2.dtypes

Name         object
Team         object
Number      float64
Position     object
Age         float64
Height       object
Weight      float64
College      object
Salary      float64
dtype: object

In [42]:
data2.isnull().sum()

Name         1
Team         1
Number       1
Position     1
Age          1
Height       1
Weight       1
College     85
Salary      12
dtype: int64

In [43]:
columns_to_convert = ['Name','College']
data2[columns_to_convert] = data2[columns_to_convert].apply(lambda x: x.astype('string'))
data2.dtypes

Name        string[python]
Team                object
Number             float64
Position            object
Age                float64
Height              object
Weight             float64
College     string[python]
Salary             float64
dtype: object

In [44]:
data2 = data2.fillna({
    'College': 'Unknown',
    'Salary': data2['Salary'].mean()  # Filling Salary NaN with its mean
})
data2.isnull().sum()

Name        1
Team        1
Number      1
Position    1
Age         1
Height      1
Weight      1
College     0
Salary      0
dtype: int64

<h2 style="font-size: 24px;">Central Tendency Calculation with the help of in-built functions</h2>

In [45]:
from scipy.stats import mode
# Dictionary to store central tendency measures for each numeric column
central_tendency = {}

# Loop through each numeric column in the dataset
for column in data2.select_dtypes(include=[float]).columns:
    column_data = data2[column]
    mode_result = mode(column_data, nan_policy='omit')  # Handle NaN if any

    central_tendency[column] = {
        "Mean": column_data.mean(),
        "Median": column_data.median(),
        "Mode": mode_value,
        "Midrange": (column_data.min() + column_data.max()) / 2,
    }

central_tendency_df = pd.DataFrame(central_tendency).T
central_tendency_df.index.name = "Attribute"

print("Measures of Central Tendency:")
print(central_tendency_df)


Measures of Central Tendency:
                   Mean     Median  Mode    Midrange
Attribute                                           
Number     1.767834e+01       13.0   0.2        49.5
Age        2.693873e+01       26.0   0.2        29.5
Weight     2.215230e+02      220.0   0.2       234.0
Salary     4.842684e+06  2880600.0   0.2  12515444.0


<h2 style="font-size: 24px;">Central Tendency Calculation without in-built functions</h2>

In [46]:
import math
# Dictionary to store central tendency measures for each column
central_tendency = {}

# Loop through each float column in the dataset
for column in data2.select_dtypes(include=[float]).columns:
    column_data = data2[column]
    
    # Calculate mean, median, mode, and midrange without using built-in functions
    filtered_data = [value for value in column_data if not math.isnan(value)]

    # Calculate mean (ignoring NaN)
    mean_value = sum(filtered_data) / len(filtered_data)
    sorted_data = sorted(column_data)
    mid = len(sorted_data) // 2
    
    median_value = (sorted_data[mid] if len(sorted_data) % 2 != 0 
                    else (sorted_data[mid - 1] + sorted_data[mid]) / 2)
    
    freq_dict = {}
    for value in column_data:
        freq_dict[value] = freq_dict.get(value, 0) + 1     
    mode_value = max(freq_dict, key=freq_dict.get)
    
    midrange_value = (min(column_data) + max(column_data)) / 2

    # Store the values in the dictionary
    central_tendency[column] = {
        "Mean": mean_value,
        "Median": median_value,
        "Mode": mode_value,
        "Midrange": midrange_value,
    }

# Convert to a DataFrame for better display
central_tendency_df = pd.DataFrame(central_tendency).T
central_tendency_df.index.name = "Attribute"

print("Measures of Central Tendency for Float Columns:")
print(central_tendency_df)


Measures of Central Tendency for Float Columns:
                   Mean     Median      Mode    Midrange
Attribute                                               
Number     1.767834e+01       13.0       5.0        49.5
Age        2.693873e+01       26.0      24.0        29.5
Weight     2.215230e+02      220.0     220.0       234.0
Salary     4.842684e+06  2880600.0  947276.0  12515444.0


<h2>Statistics for a numeric variable grouped by a categorical variable</h2>

In [55]:

summary_statistics = data2.groupby("Team").agg(
    {
        "Weight": ["mean", "median", "min", "max", "std"],
        "Salary": ["mean", "median", "min", "max", "std"]
    }
)
# Print nicely formatted summary statistics
print("\nSummary Statistics for Iris Dataset (Grouped by Species):")
print(summary_statistics)




Summary Statistics for Iris Dataset (Grouped by Species):
                            Weight                                  \
                              mean median    min    max        std   
Team                                                                 
Atlanta Hawks           221.266667  225.0  172.0  260.0  25.982045   
Boston Celtics          219.466667  220.0  180.0  260.0  25.606547   
Brooklyn Nets           215.600000  216.0  175.0  275.0  24.377390   
Charlotte Hornets       220.400000  210.0  184.0  289.0  29.908909   
Chicago Bulls           218.933333  225.0  161.0  275.0  29.336634   
Cleveland Cavaliers     227.866667  225.0  179.0  275.0  27.655706   
Dallas Mavericks        227.000000  230.0  185.0  275.0  27.836256   
Denver Nuggets          217.533333  218.0  175.0  280.0  26.297384   
Detroit Pistons         222.200000  210.0  172.0  279.0  28.899333   
Golden State Warriors   224.600000  220.0  175.0  273.0  29.642633   
Houston Rockets         220.333

<h2>Measure of Dispersion Calculation</h2>

In [57]:
# Convert data into a DataFrame
df2 = pd.DataFrame(data2)

# Selecting only numerical columns for calculation
numerical_data = df2[['Age', 'Weight', 'Salary']]

# Calculate range (difference between max and min), variance, standard deviation and quartiles
range_values = numerical_data.max() - numerical_data.min()
variance = numerical_data.var()
std_deviation = numerical_data.std()
quartiles = numerical_data.quantile([0.25, 0.5, 0.75])

# Display results
print(f"Range:\n{range_values}\n")
print(f"Variance:\n{variance}\n")
print(f"Standard Deviation:\n{std_deviation}\n")
print(f"Quartiles (25%, 50%, 75%):\n{quartiles}")

Range:
Age             21.0
Weight         146.0
Salary    24969112.0
dtype: float64

Variance:
Age       1.939536e+01
Weight    6.952895e+02
Salary    2.662690e+13
dtype: float64

Standard Deviation:
Age       4.404016e+00
Weight    2.636834e+01
Salary    5.160126e+06
dtype: float64

Quartiles (25%, 50%, 75%):
       Age  Weight     Salary
0.25  24.0   200.0  1100602.0
0.50  26.0   220.0  2880600.0
0.75  30.0   240.0  6323553.0
