##1.

In [None]:
import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency, pearsonr

In [None]:

np.random.seed(42)
gender=np.random.choice(['male','female'],size=100)
age=np.random.randint(20,60,size=100)
salary=np.random.randint(30000,80000,size=100)
department=np.random.choice(['IT','HR','Sales','House keeping'],size=100)
df=pd.DataFrame({'Gender':gender,'Age':age,'Salary':salary,'Department':department})
print("Dataset created")
print(df)

Dataset created
    Gender  Age  Salary     Department
0     male   37   51959             IT
1   female   45   35530  House keeping
2     male   53   59320          Sales
3     male   29   33748          Sales
4     male   55   61968             IT
..     ...  ...     ...            ...
95  female   46   32049  House keeping
96  female   32   61616  House keeping
97  female   22   68191             HR
98  female   58   77357             IT
99    male   25   50932             HR

[100 rows x 4 columns]


In [None]:

genders = list(set(df['Gender']))
departments = list(set(df['Department']))

g = df['Gender'].tolist()
d = df['Department'].tolist()

# Contingency table
table = {}
for gender in genders:
    table[gender] = {}
    for dept in departments:
        table[gender][dept] = 0

for i in range(len(g)):
    table[g[i]][d[i]] += 1

# Row totals
row_totals = {gender: sum(table[gender].values()) for gender in genders}#{male:34,female:35}

# Column totals
col_totals = {}
for dept in departments:
    col_totals[dept] = sum(table[gender][dept] for gender in genders)

# Grand total
grand_total = sum(row_totals.values())

# Expected frequencies
expected = {}
for gender in genders:
    expected[gender] = {}
    for dept in departments:
        expected[gender][dept] = (
            row_totals[gender] * col_totals[dept]
        ) / grand_total
print(expected)

# Chi-square
chi_square = 0
for gender in genders:
    for dept in departments:
        o = table[gender][dept]
        e = expected[gender][dept]
        chi_square += ((o - e) ** 2) / e

print("\nContingency Table:", table)
print("Contingency table",pd.DataFrame(table).T)
print("\nExpected Frequencies:", expected)
print("\nChi-square value:", chi_square)


{'female': {'IT': 13.44, 'Sales': 11.2, 'HR': 11.76, 'House keeping': 19.6}, 'male': {'IT': 10.56, 'Sales': 8.8, 'HR': 9.24, 'House keeping': 15.4}}

Contingency Table: {'female': {'IT': 16, 'Sales': 8, 'HR': 11, 'House keeping': 21}, 'male': {'IT': 8, 'Sales': 12, 'HR': 10, 'House keeping': 14}}
Contingency table         IT  Sales  HR  House keeping
female  16      8  11             21
male     8     12  10             14

Expected Frequencies: {'female': {'IT': 13.44, 'Sales': 11.2, 'HR': 11.76, 'House keeping': 19.6}, 'male': {'IT': 10.56, 'Sales': 8.8, 'HR': 9.24, 'House keeping': 15.4}}

Chi-square value: 3.5250463821892386


In [None]:
contigency_table=pd.crosstab(df['Gender'],df['Department'])
print("Contigency Table")
print(contigency_table)

Contigency Table
Department  HR  House keeping  IT  Sales
Gender                                  
female      11             21  16      8
male        10             14   8     12


In [None]:
#Chi squared Test
chi2, p, dof, expected=chi2_contingency(contigency_table)
print("\nChi-square Test Results:")
print("Chi2 Statistic:", chi2)
print("Degrees of Freedom:", dof)
print("p-value:", p)
print("Expected Frequencies:\n", expected)


Chi-square Test Results:
Chi2 Statistic: 3.525046382189238
Degrees of Freedom: 3
p-value: 0.31752818837499064
Expected Frequencies:
 [[11.76 19.6  13.44 11.2 ]
 [ 9.24 15.4  10.56  8.8 ]]


In [None]:
if p < 0.05:
  print("Conclusion: Gender and Department are correlated (not independent).")
else:
  print("Conclusion: No significant correlation between Gender and Department.")

Conclusion: No significant correlation between Gender and Department.


In [None]:
corr, p_value = pearsonr(df['Age'], df['Salary'])
print("\nCorrelation between Age and Salary:")
print("Pearson Correlation Coefficient:", corr)
print("p-value:", p_value)
if p_value < 0.05:
  print("Conclusion: Age and Salary are significantly correlated.")
else:
  print("Conclusion: No significant correlation between Age and Salary.")


Correlation between Age and Salary:
Pearson Correlation Coefficient: -0.10337576686113803
p-value: 0.30606867743201993
Conclusion: No significant correlation between Age and Salary.


In [None]:
import math

x = df['Age'].tolist()
y = df['Salary'].tolist()

n = len(x)

sum_x = sum(x)
sum_y = sum(y)

sum_xy = sum(x[i] * y[i] for i in range(n))
sum_x2 = sum(xi**2 for xi in x)
sum_y2 = sum(yi**2 for yi in y)

numerator = n * sum_xy - sum_x * sum_y

denominator = math.sqrt(
    (n * sum_x2 - sum_x**2) *
    (n * sum_y2 - sum_y**2)
)

r = numerator / denominator

print("Pearson correlation:", r)


Pearson correlation: -0.10337576686113803


## Without Using Library

##2.

In [None]:
import numpy as np
import pandas as pd
np.random.seed(42)
gender=np.random.choice(['male','female'],size=100)
age=np.random.randint(20,60,size=100)
salary=np.random.randint(30000,80000,size=100)
department=np.random.choice(['IT','HR','Sales','House keeping'],size=100)
df=pd.DataFrame({'Gender':gender,'Age':age,'Salary':salary,'Department':department})
print("Dataset created")
print(df)

Dataset created
    Gender  Age  Salary     Department
0     male   37   51959             IT
1   female   45   35530  House keeping
2     male   53   59320          Sales
3     male   29   33748          Sales
4     male   55   61968             IT
..     ...  ...     ...            ...
95  female   46   32049  House keeping
96  female   32   61616  House keeping
97  female   22   68191             HR
98  female   58   77357             IT
99    male   25   50932             HR

[100 rows x 4 columns]


In [None]:
def min_max_normalize(attribute):
    min_val=attribute.min()
    max_value=attribute.max()
    normalized_attribute=(attribute-min_val)/(max_value-min_val)
    return normalized_attribute


In [None]:
def z_score_normalize(attribute):
   mean_val=sum(attribute)/len(attribute)
   std_dev=(sum((i-mean_val) ** 2 for i in attribute)/len(attribute))**0.5
   normalized_attribute=(attribute-mean_val)/std_dev
   return normalized_attribute


In [None]:
ages_minmax=min_max_normalize(df['Age'])
salaries_minmax=min_max_normalize(df['Salary'])
ages_zscore=z_score_normalize(df['Age'])
salaries_zscore=z_score_normalize(df['Salary'])


In [None]:
print("Original Ages:",df['Age'])
print("Min-Max Ages:", ages_minmax)
print("Z-score Ages:", ages_zscore)
print("\nOriginal Salaries:",df['Salary'])
print("Min-Max Salaries:", salaries_minmax)
print("Z-score Salaries:", salaries_zscore)

Original Ages: 0     37
1     45
2     53
3     29
4     55
      ..
95    46
96    32
97    22
98    58
99    25
Name: Age, Length: 100, dtype: int64
Min-Max Ages: 0     0.435897
1     0.641026
2     0.846154
3     0.230769
4     0.897436
        ...   
95    0.666667
96    0.307692
97    0.051282
98    0.974359
99    0.128205
Name: Age, Length: 100, dtype: float64
Z-score Ages: 0    -0.284878
1     0.393403
2     1.071685
3    -0.963160
4     1.241256
        ...   
95    0.478189
96   -0.708805
97   -1.556657
98    1.495611
99   -1.302301
Name: Age, Length: 100, dtype: float64

Original Salaries: 0     51959
1     35530
2     59320
3     33748
4     61968
      ...  
95    32049
96    61616
97    68191
98    77357
99    50932
Name: Salary, Length: 100, dtype: int64
Min-Max Salaries: 0     0.438282
1     0.107405
2     0.586531
3     0.071516
4     0.639861
        ...   
95    0.037299
96    0.632771
97    0.765190
98    0.949792
99    0.417598
Name: Salary, Length: 100, dtype: floa

3.

In [None]:
transactions=[['I1', 'I2', 'I5'],['I2', 'I4'],['I2', 'I3'],['I1', 'I2', 'I4'],['I1', 'I3'],['I2', 'I3'],['I1', 'I3'],['I1', 'I2', 'I3', 'I5'],['I1', 'I2', 'I3']]
n=len(transactions)
min_sup=2
min_conf=0.2
unique_items = set()
for transaction in transactions:
    for item in transaction:
        unique_items.add(item)
unique_items=sorted(list(unique_items))
print("Unique items in transactions:", unique_items)

Unique items in transactions: ['I1', 'I2', 'I3', 'I4', 'I5']


In [3]:
import random

random.seed(42)
num_transactions=5
transactions = []
items=['I1','I2','I3']
for _ in range(num_transactions):

    # random number of items per transaction (1 to 4)
    k = random.randint(1, len(items)) # Changed 4 to len(items)

    # randomly select k unique items
    transaction = random.sample(items, k)

    transactions.append(transaction)

print(transactions)


[['I1', 'I3', 'I2'], ['I1'], ['I3'], ['I3'], ['I3', 'I1', 'I2']]


In [None]:
def apriori_freq_sets(transactions,unique_items,min_sup):
  freq_set={}
  iter=len(unique_items)
  print("unique items: ",unique_items)
  for i in range(iter):
    sets=[]
    if(i==0):
      for j in range(len(unique_items)):
        key=[]
        key.append(unique_items[j])
        sets.append(tuple(key))
    else:
      for j in range(len(unique_items)-i):
          for k in range(j+i,len(unique_items)):
              key=unique_items[j:j+i]
              key.append(unique_items[k])
              for d in deleted:
                if(set(d).issubset(set(key))):
                  continue
              sets.append(tuple(key))
    print(f'{i+1} set')
    print("sets: ",sets)
    support = {key: 0 for key in sets}
    for key,value in support.items():
      for transaction in transactions:
          if(set(key).issubset(set(transaction))):
              support[key]+=1
    deleted=[]
    for key in sets:
      if(support[key]<min_sup):
        deleted.append(key)
        support.pop(key)
    print("support: ",support)
    print("deleted: ",deleted)
    freq_set[f'{i+1} set']=support
  return freq_set


In [None]:
freq_set=apriori_freq_sets(transactions,unique_items,min_sup)

unique items:  ['I1', 'I2', 'I3', 'I4', 'I5']
1 set
sets:  [('I1',), ('I2',), ('I3',), ('I4',), ('I5',)]
support:  {('I1',): 6, ('I2',): 7, ('I3',): 6, ('I4',): 2, ('I5',): 2}
deleted:  []
2 set
sets:  [('I1', 'I2'), ('I1', 'I3'), ('I1', 'I4'), ('I1', 'I5'), ('I2', 'I3'), ('I2', 'I4'), ('I2', 'I5'), ('I3', 'I4'), ('I3', 'I5'), ('I4', 'I5')]
support:  {('I1', 'I2'): 4, ('I1', 'I3'): 4, ('I1', 'I5'): 2, ('I2', 'I3'): 4, ('I2', 'I4'): 2, ('I2', 'I5'): 2}
deleted:  [('I1', 'I4'), ('I3', 'I4'), ('I3', 'I5'), ('I4', 'I5')]
3 set
sets:  [('I1', 'I2', 'I3'), ('I1', 'I2', 'I4'), ('I1', 'I2', 'I5'), ('I2', 'I3', 'I4'), ('I2', 'I3', 'I5'), ('I3', 'I4', 'I5')]
support:  {('I1', 'I2', 'I3'): 2, ('I1', 'I2', 'I5'): 2}
deleted:  [('I1', 'I2', 'I4'), ('I2', 'I3', 'I4'), ('I2', 'I3', 'I5'), ('I3', 'I4', 'I5')]
4 set
sets:  [('I1', 'I2', 'I3', 'I4'), ('I1', 'I2', 'I3', 'I5'), ('I2', 'I3', 'I4', 'I5')]
support:  {}
deleted:  [('I1', 'I2', 'I3', 'I4'), ('I1', 'I2', 'I3', 'I5'), ('I2', 'I3', 'I4', 'I5')]
5

In [None]:
print(freq_set)

{'1 set': {('I1',): 6, ('I2',): 7, ('I3',): 6, ('I4',): 2, ('I5',): 2}, '2 set': {('I1', 'I2'): 4, ('I1', 'I3'): 4, ('I1', 'I5'): 2, ('I2', 'I3'): 4, ('I2', 'I4'): 2, ('I2', 'I5'): 2}, '3 set': {('I1', 'I2', 'I3'): 2, ('I1', 'I2', 'I5'): 2}, '4 set': {}, '5 set': {}}


In [None]:
from itertools import combinations
def association_rules(freq_set, transactions, min_conf):
    rules = []
    n = len(transactions)
    print("Association Rules")
    for k, itemsets in freq_set.items():
        for itemset, support_count in itemsets.items():
            if len(itemset) > 1:
                for i in range(1, len(itemset)):
                    for antecedent in combinations(itemset, i):
                        consequent = tuple(set(itemset) - set(antecedent))
                        support_itemset = support_count / n
                        support_antecedent = sum(1 for t in transactions if set(antecedent).issubset(t)) / n
                        support_consequent = sum(1 for t in transactions if set(consequent).issubset(t)) / n
                        confidence = support_itemset / support_antecedent if support_antecedent > 0 else 0
                        if confidence >= min_conf:
                            lift = confidence / support_consequent if support_consequent > 0 else 0
                            correlation = "Positively correlated" if lift > 1 else "Negatively correlated"
                            print(f"Rule: {antecedent} -> {consequent}, "
                                  f"Confidence: {confidence:.2f}, "
                                  f"Lift: {lift:.2f} ({correlation})")

In [None]:
association_rules(freq_set, transactions, min_conf)

Association Rules
Rule: ('I1',) -> ('I2',), Confidence: 0.67, Lift: 0.86 (Negatively correlated)
Rule: ('I2',) -> ('I1',), Confidence: 0.57, Lift: 0.86 (Negatively correlated)
Rule: ('I1',) -> ('I3',), Confidence: 0.67, Lift: 1.00 (Negatively correlated)
Rule: ('I3',) -> ('I1',), Confidence: 0.67, Lift: 1.00 (Negatively correlated)
Rule: ('I1',) -> ('I5',), Confidence: 0.33, Lift: 1.50 (Positively correlated)
Rule: ('I5',) -> ('I1',), Confidence: 1.00, Lift: 1.50 (Positively correlated)
Rule: ('I2',) -> ('I3',), Confidence: 0.57, Lift: 0.86 (Negatively correlated)
Rule: ('I3',) -> ('I2',), Confidence: 0.67, Lift: 0.86 (Negatively correlated)
Rule: ('I2',) -> ('I4',), Confidence: 0.29, Lift: 1.29 (Positively correlated)
Rule: ('I4',) -> ('I2',), Confidence: 1.00, Lift: 1.29 (Positively correlated)
Rule: ('I2',) -> ('I5',), Confidence: 0.29, Lift: 1.29 (Positively correlated)
Rule: ('I5',) -> ('I2',), Confidence: 1.00, Lift: 1.29 (Positively correlated)
Rule: ('I1',) -> ('I2', 'I3'), Con

In [None]:
from collections import defaultdict

# 1. Define the Transactional Dataset (Horizontal Format)
transactions = [
    ['I1', 'I2', 'I5'],
    ['I2', 'I4'],
    ['I2', 'I3'],
    ['I1', 'I2', 'I4'],
    ['I1', 'I3'],
    ['I2', 'I3'],
    ['I1', 'I3'],
    ['I1', 'I2', 'I3', 'I5'],
    ['I1', 'I2', 'I3']
]

def convert_to_vertical(dataset):
    # Use a dictionary where the key is the Item and value is a list of TIDs
    vertical_map = defaultdict(list)

    # Iterate through transactions with an index acting as the TID
    for index, transaction in enumerate(dataset):
        tid = f"T{index + 1}"  # Creating IDs like T1, T2, T3...
        for item in transaction:
            vertical_map[item].append(tid)

    return dict(vertical_map)

# 2. Perform Conversion
vertical_data = convert_to_vertical(transactions)

# 3. Display the Output
print(f"{'Item ID':<10} | {'Transaction List (TID set)':<30}")
print("-" * 45)

# Sort items for cleaner display (I1, I2, I3...)
for item in sorted(vertical_data.keys()):
    tids = ", ".join(vertical_data[item])
    print(f"{item:<10} | {tids}")

# Show count (Support Count)
print("\nSupport Counts (Summary):")
for item in sorted(vertical_data.keys()):
    print(f"{item}: {len(vertical_data[item])}")

Item ID    | Transaction List (TID set)    
---------------------------------------------
I1         | T1, T4, T5, T7, T8, T9
I2         | T1, T2, T3, T4, T6, T8, T9
I3         | T3, T5, T6, T7, T8, T9
I4         | T2, T4
I5         | T1, T8

Support Counts (Summary):
I1: 6
I2: 7
I3: 6
I4: 2
I5: 2
