In [1]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Default data directory to read and write data
workingDirectory = os.path.join(os.getcwd(), "Data")
pd.option_context('display.max_rows', None, 'display.max_columns', None)

<pandas._config.config.option_context at 0x22e2b5d0fd0>

In [2]:
def check_int(s):
    s = str(s)
    if s[0] in ('-', '+'):
        return s[1:].isdigit()
    return s.isdigit()

def read_data(path: str = workingDirectory, fileName: str = "train.csv") -> pd.DataFrame:
	"""
	Given a path, folderName and fileName with extension type, return a pd.dataframe with the data
	:param path: default is current working directory
	:param fileName: default is Train.csv, but can read csv, txt
	:return: pd.Dataframe
	"""
	result = pd.DataFrame.empty
	try:
		if fileName.split(".")[1] in ('csv', 'txt'):
			result = pd.read_csv(os.path.join(path, fileName))
		else:
			raise TypeError("File format not allowed")

	except Exception as e:
		str(e)

	return result

def skew_classification(skew:float) -> str:
	classification = ''
	"""
	classify according to the following skew values:
		Fairly Symmetrical	-0.5 to 0.5
		Moderate Skewed	-0.5 to -1.0 and 0.5 to 1.0
		Highly Skewed	< -1.0 and > 1.0
	:param skew: skew value according to pd.Dataframe.Skew()
	:return: string classification
	"""
	try:
		if(float(skew) >= float(-0.5)) and (float(skew) <= float(0.5)):
			classification = 'Fairly Symmetrical'
		elif ((float(skew) >= float(-1.0))and (float(skew)) < float(-0.5)) or ((float(skew) > float(0.5))and (float(skew)) < float(1.0)):
			classification = 'Moderate Skewed'
		elif (float(skew) < float(-1.0)) or (float(skew) > float(1.0)):
			classification = 'Highly Skewed'
	except Exception as ex:
		classification += 'Error:'+str(ex)

	return classification
def skew_sign(skew:float) -> str:
	classification = '+'
	"""
	classify according to the following skew values:
		skew > 0 than + else -
	:param skew: skew value according to pd.Dataframe.Skew()
	:return: string classification
	"""
	try:
		if(float(skew) < 0):
			classification ='-'
	except Exception as ex:
		classification += 'Error:'+str(ex)

	return classification

In [56]:
	classSet = read_data(fileName="class.txt").transpose()
	trainSet = read_data(fileName="train.csv")
	testSet = read_data(fileName="test.csv")

In [111]:
# getting only the columns that are integers
numericColumns = [col for col in trainSet.columns.values if check_int(col)]
# only printing the columns to work with
# making a copy to work with numerical columns and index as row
df = trainSet[numericColumns].copy(deep=False)
df.index = trainSet['SNO']
df.sort_values(by=['SNO'], ascending=True, inplace=True)

# creating skew column per row
df["skew"] = df.skew(axis=1)

# classifying skew
df['skew_classification'] = df['skew'].apply(skew_classification)
df['skew_sign'] = df['skew'].apply(skew_sign)

df

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,63,64,65,66,67,68,69,skew,skew_classification,skew_sign
SNO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A28102_at,30,46,31,31,26,28,35,29,21,22,...,51,71,68,77,56,41,38,0.556750,Moderate Skewed,+
AB000114_at,22,31,19,16,26,24,29,20,23,22,...,24,19,21,22,25,21,17,0.592930,Moderate Skewed,+
AB000115_at,29,70,12,11,14,13,14,18,10,-3,...,61,62,35,30,65,32,25,3.337639,Highly Skewed,+
AB000220_at,76,208,244,39,85,23,634,159,50,23,...,53,28,30,31,31,27,16,3.128941,Highly Skewed,+
AB000381_s_at,17,15,15,11,10,14,16,43,48,45,...,17,16,13,16,32,19,14,0.538037,Moderate Skewed,+
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Z94753_s_at,14,27,20,17,22,12,21,5,4,2,...,32,32,24,38,28,10,16,-0.707327,Moderate Skewed,-
Z95624_at,34,54,51,32,36,32,62,102,87,122,...,26,34,27,27,31,34,32,3.053704,Highly Skewed,+
Z96810_at,7,10,10,7,10,8,12,6,21,14,...,7,7,8,8,2,10,9,3.764985,Highly Skewed,+
Z97054_xpt2_at,125,167,166,180,196,204,150,50,48,9,...,110,142,153,145,157,149,145,-0.409167,Fairly Symmetrical,-


In [112]:
classSet

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,59,60,61,62,63,64,65,66,67,68
Class,MED,MED,MED,MED,MED,MED,MED,MED,MED,MED,...,EPD,EPD,EPD,EPD,JPA,JPA,JPA,JPA,JPA,JPA


In [None]:
# skew report to see distribution between sign and classification
df.groupby(['skew_classification','skew_sign']).count().transpose()[:1]

In [127]:
dfTraspose = trainSet[numericColumns].transpose()
dfTraspose = dfTraspose.apply(lambda x: pd.to_numeric(x, downcast='integer'), axis=1)
dfTraspose.hist(subplots=True)

AttributeError: 'Rectangle' object has no property 'subplots'

Error in callback <function flush_figures at 0x000001E68136C8B0> (for post_execute):


KeyboardInterrupt: 