<a href="https://colab.research.google.com/github/Aneliukee/AI/blob/main/AI_L1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Problem**

---



Which of the possum's body parts are most correlated with age and sex?

> **Descriptive features**



*case* — observation number;

*site* — the site number where the possum was trapped;

*Pop* — a factor that classifies the sites as Vic (Victoria) or other (New South Wales or Queensland);

*sex* — gender (m for male, f for female);

*age* — age.

> **Target features**

*hdlngth* — head length, in mm.;

*skullw* — skull width, in mm.;

*totlngth* — total length, in cm.;

*taill* — tail length, in cm.;

*footlgth* — foot length;

*earconch* — ear conch length;

*eye* — distance from medial canthus to lateral canthus of right eye;

*chest* — chest girth (in cm);

*belly* — belly girth (in cm).

# **Data analysis**

---


> **Categorical**

ordinal: `case`, `site`; 

categorical: `Pop`;    

binary: `sex`.    

> **Continuous**

numerical: `age`, `hdlngth`, `skullw`, `totlngth`, `taill`, `footlgth`, `earconch`, `eye`, `chest`, `belly`.

> **Data Quality Report**



Script from: https://github.com/benjimr/Data-Summarizer-for-a-Data-Quality-Report/blob/master/Reporter.py

In [136]:
import pandas as pd
import numpy as np
from google.colab import drive
import warnings

def getHeaders():
    """Load the column names and the fixed continuous/categorical feature lists.

    The names are read from the first column of '/content/feature-names.txt'
    (no header row, at most 14 rows).

    Returns
    -------
    tuple
        ``(headersList, conFeat, catFeat)``: every name read from the file,
        the continuous feature names, and the categorical feature names.
    """
    headersDF = pd.read_csv('/content/feature-names.txt', header=None, nrows=14)

    # Pull each entry of the first column out by row label, in file order.
    nameColumn = headersDF[0]
    headersList = [nameColumn[rowLabel] for rowLabel in range(0, len(headersDF))]

    # The split between continuous and categorical features is fixed by hand.
    catFeat = ['case', 'site', 'Pop', 'sex']
    conFeat = [
        'age', 'hdlngth', 'skullw', 'totlngth', 'taill',
        'footlgth', 'earconch', 'eye', 'chest', 'belly',
    ]

    return headersList, conFeat, catFeat

def processContinuous(conFeat, data):
    """Build the continuous-feature table of the data quality report.

    Parameters
    ----------
    conFeat : list of str
        Names of the continuous columns to summarise.
    data : pandas.DataFrame
        Frame containing at least the columns named in ``conFeat``.

    Returns
    -------
    pandas.DataFrame
        One row per feature (index named ``FEATURENAME``) with the columns
        'Count', 'Miss %', 'Card.', 'Min', '1st Qrt.', 'Mean', 'Median',
        '3rd Qrt', 'Max' and 'Std. Dev.'.

    Raises
    ------
    KeyError
        If a name in ``conFeat`` is not a column of ``data``.
    """
    conHead = ['Count', 'Miss %', 'Card.', 'Min', '1st Qrt.', 'Mean', 'Median', '3rd Qrt', 'Max', 'Std. Dev.']

    conOutDF = pd.DataFrame(index=conFeat, columns=conHead)
    conOutDF.index.name = 'FEATURENAME'
    columns = data[conFeat]

    # COUNT - DataFrame.count() only counts non-null cells.
    conOutDF[conHead[0]] = columns.count()

    # MISS % - share of null cells per column. This must be measured rather
    # than assumed: 'Count' above already excludes nulls, so a fixed 0 here
    # would contradict it whenever a column has gaps. An empty frame yields
    # NaN from mean(); report that as 0.0.
    conOutDF[conHead[1]] = (columns.isna().mean() * 100).fillna(0.0).round(2)

    # CARDINALITY - distinct non-null values.
    conOutDF[conHead[2]] = columns.nunique()

    # MINIMUM
    conOutDF[conHead[3]] = columns.min()

    # 1ST QUARTILE - rounded to a whole number.
    conOutDF[conHead[4]] = columns.quantile(0.25).round()

    # MEAN - rounded to two decimals.
    conOutDF[conHead[5]] = columns.mean().round(2)

    # MEDIAN - rounded to a whole number.
    conOutDF[conHead[6]] = columns.median().round()

    # 3RD QUARTILE - rounded to a whole number.
    conOutDF[conHead[7]] = columns.quantile(0.75).round()

    # MAXIMUM
    conOutDF[conHead[8]] = columns.max()

    # STANDARD DEVIATION - rounded to two decimals.
    conOutDF[conHead[9]] = columns.std().round(2)

    return conOutDF
	
def _modeStats(observed, rank, observedTotal):
    """Return (value, frequency, percent) for the ``rank``-th most frequent value.

    ``observed`` is a value_counts() Series sorted by descending frequency.
    When fewer than ``rank + 1`` distinct values exist, ('', 0, 0.0) is
    returned instead of raising.
    """
    if rank >= len(observed):
        return '', 0, 0.0
    freq = observed.iloc[rank]
    return observed.index[rank], freq, round(freq / observedTotal * 100, 2)


def processCategorical(catFeat, data):
    """Build the categorical-feature table of the data quality report.

    The string ' ?' is treated as the missing-value marker: it is reported
    under 'Miss %', removed from the cardinality, and excluded from the mode
    statistics, so mode percentages are shares of the observed values.

    Parameters
    ----------
    catFeat : list of str
        Names of the categorical columns to summarise.
    data : pandas.DataFrame
        Frame containing at least the columns named in ``catFeat``.

    Returns
    -------
    pandas.DataFrame
        One row per feature (index named ``FEATURENAME``) with the columns
        'Count', 'Miss %', 'Card.', 'Mode', 'Mode Freq', 'Mode %',
        '2nd Mode', '2nd Mode Freq' and '2nd Mode %'. A feature with fewer
        than two distinct observed values gets '', 0 and 0.0 in the
        corresponding mode columns.

    Raises
    ------
    KeyError
        If a name in ``catFeat`` is not a column of ``data``.
    """
    missingMarker = ' ?'
    catHead = ['Count', 'Miss %', 'Card.', 'Mode', 'Mode Freq', 'Mode %', '2nd Mode', '2nd Mode Freq', '2nd Mode %']

    rows = []
    for col in catFeat:
        series = data[col]
        count = series.count()          # non-null cells, marker included
        cardinality = series.nunique()
        values = series.value_counts()  # sorted by descending frequency

        # MISS % - an explicit membership test replaces a blanket
        # try/except, so unrelated errors are no longer reported as
        # "nothing missing".
        if missingMarker in values.index:
            missing = values.loc[missingMarker]
            values = values.drop(missingMarker)
            # The marker was counted as a distinct value; take it back out.
            cardinality -= 1
        else:
            missing = 0
        missPercent = round(missing / count * 100, 2) if count else 0.0

        # MODES - the denominator is the exact number of observed values,
        # not a figure rebuilt from the rounded miss percentage.
        observedTotal = values.sum()
        mode, modeFreq, modePercent = _modeStats(values, 0, observedTotal)
        mode2, modeFreq2, modePercent2 = _modeStats(values, 1, observedTotal)

        rows.append([
            count, missPercent, cardinality,
            mode, modeFreq, modePercent,
            mode2, modeFreq2, modePercent2,
        ])

    catOutDF = pd.DataFrame(
        rows,
        index=pd.Index(catFeat, name='FEATURENAME'),
        columns=catHead,
    )
    return catOutDF

def main():
    """Generate both data quality reports for the possum data set.

    Reads '/content/possum.csv' (no header row, first 104 rows) using the
    column names supplied by getHeaders(), summarises the continuous and the
    categorical features, and writes one CSV file per summary.
    """
    # Every warning category is silenced from here on.
    warnings.simplefilter(action='ignore', category=Warning)

    featureNames, continuousNames, categoricalNames = getHeaders()

    # The CSV carries no header row, so the names are attached on load.
    possum = pd.read_csv(
        '/content/possum.csv',
        header=None,
        nrows=104,
        names=featureNames,
    )

    # Both summaries are built first; nothing is written until both exist.
    reports = (
        ("/content/ContinuousReport.csv", processContinuous(continuousNames, possum)),
        ("/content/CategoricalReport.csv", processCategorical(categoricalNames, possum)),
    )
    for targetPath, report in reports:
        report.to_csv(targetPath)


# Run the report when this code is executed as the top-level module.
if __name__ == '__main__':
    main()

In [88]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import plotly.express as px
from google.colab import drive


Drive already mounted at /drive; to attempt to forcibly remount, call drive.mount("/drive", force_remount=True).
