In [1]:
import numpy as np

#i import pandas for data anlysis and manipulations
import pandas as pd

#matplotlib library for plotting 2D graphics
import matplotlib.pyplot as plt



In [2]:
# loading the datasest

df = pd.read_sas("bs04retail.sas7bdat")
df.head()

Unnamed: 0,Store,Con1,Con2
0,1.0,141.0,118.0
1,2.0,184.0,167.0
2,3.0,132.0,137.0
3,4.0,161.0,168.0
4,5.0,176.0,175.0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Store   10 non-null     float64
 1   Con1    10 non-null     float64
 2   Con2    10 non-null     float64
dtypes: float64(3)
memory usage: 372.0 bytes


In [4]:
df['Store'] = df['Store'].astype(int)
df.head()

Unnamed: 0,Store,Con1,Con2
0,1,141.0,118.0
1,2,184.0,167.0
2,3,132.0,137.0
3,4,161.0,168.0
4,5,176.0,175.0


In [5]:
df.isnull()


Unnamed: 0,Store,Con1,Con2
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False
4,False,False,False
5,False,False,False
6,False,False,False
7,False,False,False
8,False,False,False
9,False,False,False


# Building the Hypothesis Testing system

In [30]:
import numpy as np
import pandas as pd
from scipy.stats import ttest_ind, ttest_1samp

#Initialize the Hypothesis class
class Hypothesis:
    def __init__(self, df):
        self.df = df
        
        # one tail test method
    def one_tailed_test(self, item, alpha=0.05):
        try:
            df_item = np.array(self.df[item])
            mean_difference = None
            t_stat, pvalue = ttest_1samp(df_item, alpha)
            
            if pvalue <= alpha and t_stat > 0:
                result = f"\n since p_value = {pvalue:.9f} <= {alpha} and {t_stat:.4f} > 0, we Reject the null hypothesis. There is a significant positive difference in {item}."
                
            else:
                result = f"\n since p_value = {pvalue:.9f} > {alpha}, there's no sufficient evidence to reject the null hypothesis H0. There is no significant difference in {item}."
                

            return result, t_stat, pvalue, mean_difference

        except KeyError as e:
            return f"Error: The item {e} not found in the DataFrame. Please enter a valid item name."

        except Exception as e:
            return f"Error: An unexpected error occurred. Details: {str(e)}"
        
        # two tail test method
    def two_tailed_test(self, item1, item2, alpha=0.05):
        try:
            df_item1 = np.array(self.df[item1])
            df_item2 = np.array(self.df[item2])
            
            t_stat, pvalue = ttest_ind(df_item1, df_item2, equal_var=False)  # Using Welch's t-test
            mean_difference = df_item1.mean() - df_item2.mean()
            
            
            if pvalue <= alpha:
                result = f"\nsince p_value = {pvalue:.4f} <= {alpha}, we Reject the null hypothesis. There is a significant difference between {item1} and {item2}."
                
            else:
                result = f"\nsince p_value = {pvalue:.4f} > {alpha}, we cannot reject the null hypothesis H0. There is no significant difference between {item1} and {item2}. hence, the data does not provide enough evidence to support the claim that there is a significant difference between the two items"
                

            return result, t_stat, pvalue, mean_difference

        except KeyError as e:
            return f"Error: One or both items {e} not found in the DataFrame. Please enter valid item names."
        

        except Exception as e:
            return f"Error: An unexpected error occurred. Details: {str(e)}"
        
        
      



In [31]:


def hypothesis_test(data):
    try:
        #  available items for testing
        print("\nAvailable items for testing are:", ", ".join(data.columns))
    

        # Instantiating the Hypothesis class
        tester = Hypothesis(data)
    

         # available test types for user to choose
        print("\nAvailable test types are: one-tailed, two-tailed")


        # Choose type of test (one-tailed or two-tailed)
        test_type = input("\nwhat type of test do you want to perform ?: ").lower()

        if test_type == 'one-tailed':
           
        # User: Choose item to test
            item = input("\nplease enter the name of the item/product you want to test: ").strip()

            # User input: Choose significance level (alpha)
            alpha = float(input("\nplease Enter significance levelfor your test (alpha): "))

            result, t_stat, pvalue, mean_difference = tester.one_tailed_test(item, alpha)
            
            
        elif test_type == 'two-tailed':
            # User input: Choose items to test
            items = input("\nplease enter the names of the items you want to test (comma-separated): ").split(',')

            # User input: Choose significance level (alpha)
            alpha = float(input("\nEnter significance level (alpha): "))

            result, t_stat, pvalue, mean_difference = tester.two_tailed_test(items[0], items[1], alpha)
            
             # Calculate Pearson correlation between the two items
            correlation = data.corr().loc[items[0], items[1]]
            
        
        else:
            result = "Error: Invalid test type. Please choose either 'one-tailed' or 'two-tailed'."
            correlation = None

        # Print the t-test results
        print(result)

            # Print additional details if available
        if mean_difference is not None:
            print(f"\nmean Difference between {items[0]} and {items[1]}: {mean_difference:.2f}")
            if mean_difference >0:
                print(f"\nthe data constitute significant evidence that the underlying mean No was greater for {items[0]}, by an estimated value of {mean_difference:.3f}. the result suggest that {items[0]} should be preferred")
            else:
                print(f"\nthe data constitute significant evidence that the underlying mean No was greater for {items[1]}, by an estimated value of {abs(mean_difference):.3f}. the result suggest that {items[1]} should be preferred")
            

        # Print the t-statistic and p-value
        print(f"\nt-statistic: {t_stat:.4f}")
        print(f"\npvalue: {pvalue:.9f}")
        
        
          # Output table containing sample size, mean, variance, and Pearson correlation
        stat_table = pd.DataFrame(columns=['Item', 'Sample Size', 'Mean', 'Variance', f'r of {items[0]} & {items[1]}'])
        for item in data.columns:
            
            sample_size = len(data[item])
            mean = data[item].mean()
            variance = data[item].var()
            pearson_corr = correlation if item == items[1] and test_type == 'two-tailed' else None
            stat_table = pd.concat([stat_table, pd.DataFrame({
            'Item': [item],
            'Sample Size': [sample_size],
            'Mean': [mean],
            'Variance': [variance],
            f'r of {items[0]} & {items[1]}': [pearson_corr]
        })], ignore_index=True)

        print("\nstatistic Table:")
        print(stat_table)
        
    except IndexError:
            return f"Error:invalid items name or index error, please enter the correct item/product names."
        
    except UnboundLocalError:
        return f"UnboundLocalError: Invalid test type. Please choose either 'one-tailed' or 'two-tailed'."
    
    except ValueError:
        return f"ValueError: please enter numeric value and not character."
    
    except:
        print("Some other exception happened.")



In [32]:
hypothesis_test(df)


Available items for testing are: Store, Con1, Con2

Available test types are: one-tailed, two-tailed

what type of test do you want to perform ?: two-tailed

please enter the names of the items you want to test (comma-separated): Con1,Con2

Enter significance level (alpha): 0.05

since p_value = 0.3015 > 0.05, we cannot reject the null hypothesis H0. There is no significant difference between Con1 and Con2. hence, the data does not provide enough evidence to support the claim that there is a significant difference between the two items

mean Difference between Con1 and Con2: 13.20

the data constitute significant evidence that the underlying mean No was greater for Con1, by an estimated value of 13.200. the result suggest that Con1 should be preferred

t-statistic: 1.0638

pvalue: 0.301487919

statistic Table:
    Item Sample Size   Mean    Variance  r of Con1 & Con2
0  Store          10    5.5    9.166667               NaN
1   Con1          10  172.6  750.266667               NaN
2  