In [1]:
import pandas as pd              #importing pandas to handle large datasets.
import matplotlib.pyplot as plt  #importing mathplotlib pyplot for graphs.
import math

In [2]:
def fileread():
    """
    This function asks the user to enter the path to the CSV file to open,
    in case of an error it prints and error and asked the user to input again., and returns a list of tuples.
    
    User input (string): asks user for input
            Example: TempLog/tempLog.csv
    
    Output (list): list of tuples where each line of CSV is a tuple,
                   length of the tuple is equal to the number of values in each line of the CSV.
            Example: 
                     1634036226,21.69,31.95
                     1634036406,21.67,32.07
                     1634036586,21.75,32.16
                     
                     would return
                     
                     [(1634036226,21.69,31.95),
                      (1634036406,21.67,32.07),
                      (1634036586,21.75,32.16)]
    
    """
   
    #run until a valid input has been given  
    while (True):
        #take input for the .csv file path
        path=input("Enter the path to the CSV file:")  
        try:                                    #try to open the given path
            with open(path,'r') as f: 
                lines=f.read().splitlines()  
            dfp=pd.read_csv(path)               #pandas' dataframe
            break                               #stop the while loop if opening file works.
        except:                                 #ask for input again in case of an error.
            print("invalid path, try again")
       
        
    #init empty list, we will store our tuples here    
    data=[]
    
    for i in lines:                             #iterate over every line in the list called lines
        temp=i.split(',')                       #data is csv file, always would be comma seperated. So, split at every ","
        emptytup=()                             #init empty tuple 
        
        for i in range(len(temp)):              #for every element in line
            emptytup=(*emptytup,temp[i])        #append to inited tuple using tuple unpacking
        data.append(emptytup)                   #append tuple to inited list
    print("loaded",len(data)-1,"values")    
    return data, dfp                            #return list as data nad pandas dataframe as dfp

In [3]:
df,dfp=fileread()

Enter the path to the CSV file:
invalid path, try again
Enter the path to the CSV file:TempLog/tempLog.csv
loaded 128801 values


In [4]:
#Functions to calculate and display stats from a dataframe.

def mean(inputList):
    """
    This function calculates the mean of from one input list.
   
    Input (list): List of integers or floats.
            Example: [1,2,3,4,5]
    
    Output (float): mean of the values in the list
    """
    
    mean=(sum(inputList))/len(inputList)
    return mean

def std(inputList):
    """
    This function calculates the standard deviation from one input list. 
    
    Input (list): List of integers or floats.
            Example: [1,2,3,4,5]
            
    Output(float): stand deviation of the input list
    """
    
    m=mean(inputList)
    std=(mean([(i-m)**2 for i in inputList]))**(1/2)
    return std

def percentile(inputList,size):
    """
    This function calculates percentile. This can only take one list, and requires percentile size in %.
    
    Input (list,float): takes one list and percentile size in %.
            Example: a=[1,2,3,4,5], 75
            
    Output(float): percentile value of the sorted input list and percentile size
    """
    
    per=sorted(inputList)[int((size/100)*len(inputList))]
    return per

def desStats(inputList):
    """
    This function takes in other functions and dataframe to display descriptive statistic about the data. 
    Assumes data is contains valid statistics and has been cleaned up.
    
    Input (list): takes list of tuples.
            Example: a=[(1,2,3),(4,5,6),(7,8,9)]
            
    Output(.txt): output a .txt in project folder, the .txt file display statistical data about the data.
            Example: Name:  ['Timestamp', 'Air', 'CPU'] 
                     Mean:  [1646189878.8116164, 21.054133120084543, 31.75321263033757] 
                     STD:  [7106630.236362961, 2.840308601902566, 2.574343413699797] 
                     25%:  [1639905667.0, 19.24, 30.06] 
                     50%:  [1646629747.0, 20.72, 31.57] 
                     75%:  [1652425927.0, 22.54, 33.17] 
                     min:  [1634036226.0, 3.02, 11.5] 
                     max:  [1658221746.0, 31.47, 41.46] 
                     Correlation:  0.9638861694143815

    
    """
    
    #init empty lists for the values
    name=[]
    meanV=[]
    stdV=[]
    tf=[]
    ft=[]
    sf=[]
    minV=[]
    maxV=[]
    count=[]
    #runs for loop for the width of the list, a=[(1,2,3),(4,5,6),(7,8,9)], len(a)=3, loops 3 times.
    for j in range(len(inputList[0])):
        col=[float(i[j]) for i in inputList[1:]] #save the column in a variable.
        
        #append empty list to store data for each loop
        name.append(inputList[0][j])             #Name of the column
        count.append(len(col))                   #count
        meanV.append(mean(col))                  #Mean
        stdV.append(std(col))                    #Standard deviation
        tf.append(percentile(col,25))            #25%
        ft.append(percentile(col,50))            #50%
        sf.append(percentile(col,75))            #75%
        minV.append(min(col))                    #smallest value 
        maxV.append(max(col))                    #largest value
        
    print(" Name: ",name,"\n",
          "Count: ",count,"\n",
          "Mean: ",meanV,"\n",
          "STD: ",stdV ,"\n",
          "25%: ",tf ,"\n",
          "50%: ",ft ,"\n",
          "75%: ",sf ,"\n",
          "min: ",minV ,"\n",
          "max: ",maxV,"\n",
          "Correlation: ", correl(inputList),file=open('log.txt', 'w'))
    return[count,meanV,stdV,minV,tf,ft,sf,maxV,correl(inputList)]
    
def correl(inputList):
    """
    This function calculates Pearson correlation coefficient between two columns. 
    
    Input (list): takes second and third column of the input list.
            Example: a=[(store,fruit,veg),(1,10,10),(2,100,200)]
            
    Output(float): Pearson correlation coefficient of the two list, between 0 to 1.
    """
    
    list1=[float(i[1]) for i in inputList[1:]]                                            #Column 1
    list2=[float(i[2]) for i in inputList[1:]]                                            #Column 2
    mean1=mean(list1)                                                                     #Mean of column 1
    mean2=mean(list2)                                                                     #Mean of column 2
    std1=std(list1)                                                                       #Standard deviation of column 1
    std2=std(list2)                                                                       #Standard deviation of column 2
    cor=(sum([(i-mean1)*(j-mean2) for i,j in zip(list1,list2)]))/(len(list1)*std1*std2)   #Calculate Pearson's correlation r value 
    return cor

In [5]:
stats=desStats(df)

In [6]:
def verify(inputlist,dataframe):
    """
    This function verifies the functions created in this script with Pandas's functions by assert.
    Values compared are printed and assertions fail would be raised if assertion is false.
    
    Input (list,list): desStats and pandas.read.
            Example: stats, dfp
    """
    
    dfpDes=[dataframe.count(),
            dataframe.mean(),
            dataframe.std(ddof=0),                          #changed degree of freedom from default 1 to 0.
            dataframe.min(),
            dataframe.quantile(0.25,interpolation='lower'), #keep interpolation constant with percentile().
            dataframe.quantile(0.50,interpolation='lower'), #keep interpolation constant with percentile().
            dataframe.quantile(0.75,interpolation='lower'), #keep interpolation constant with percentile().
            dataframe.max(),
            dataframe.corr()["CPU"]["Air"]]
    for j in range(dataframe.shape[-1]):
        for k in range(len(inputlist)-1):
            print(float(dfpDes[k][j]),[float(i[j]) for i in inputlist[:-1]][k])
            try:
                assert(float(dfpDes[k][j])==[float(i[j]) for i in inputlist[:-1]][k]) # assert if they are equal.
            except:
                print("Assertion Error: Verify failed")
    print(float(dfpDes[-1]),inputlist[-1])
    try:
                assert(float(dfpDes[-1])==inputlist[-1]) # assert if correls are equal.
    except:
                print("Assertion Error: Verify failed")
     
    print("Results verified successfully")
    

In [7]:
verify(stats,dfp)

128801.0 128801.0
1646189878.8116164 1646189878.8116164
7106630.236362961 7106630.236362961
1634036226.0 1634036226.0
1639905667.0 1639905667.0
1646629747.0 1646629747.0
1652425747.0 1652425747.0
1658221746.0 1658221746.0
128801.0 128801.0
21.054133120084543 21.054133120084543
2.840308601902566 2.840308601902566
3.02 3.02
19.24 19.24
20.72 20.72
22.54 22.54
31.47 31.47
128801.0 128801.0
31.75321263033757 31.75321263033757
2.574343413699797 2.574343413699797
11.5 11.5
30.06 30.06
31.57 31.57
33.17 33.17
41.46 41.46
0.9638861694144325 0.9638861694143815
Assertion Error: Verify failed
Results verified successfully


In [8]:
def logxform(inputList):
    """
    This function calculates logarithm of every element in input list.
    
    Input (list): List of integers or floats.
            Example: [1,2,3,4,5,6,7,8,9]
    Output (list): return a list with log transform applied.
    """
    
    return [math.log(i) for i in inputList]

def meanred(inputList):
    """
    This function performs mean reduction.
    
    Input (list): List of integers or floats.
            Example: [1,2,3,4,5,6,7,8,9]
    Output (list): return a list with mean reduction applied.
    """
    
    mu=mean(inputList)                                  #mean of all the values in the list
    mx=max(inputList)-min(inputList)                    #largest value minus smallest value of the input list
    return [(i-mu)/mx for i in inputList]

def minmaxred(inputList):
    """
    This function performs min max reduction.
    
    Input (list): List of integers or floats.
            Example: [1,2,3,4,5,6,7,8,9]
    Output (list): return a list with min max reduction applied.
    """
    
    mn=min(inputList)                                   #smallest value in input
    mx=max(inputList)                                   #largest value in input
    return [(i-mn)/(mx-mn) for i in inputList]

def applymeanred(inputList):
    """
    This function applies mean reduction to the second and third column of input.
    
    Input (list): List of integers or floats.
            Example: [1,2,3,4,5,6,7,8,9]
    Output (list): list of tuples 
            Example: 
                     [(1634036226,21.69,31.95),
                      (1634036406,21.67,32.07),
                      (1634036586,21.75,32.16)]
    
    """
    
    col0=[float(i[0]) for i in inputList[1:]]           #first column
    col1=meanred([float(i[1]) for i in inputList[1:]])  #second column
    col2=meanred([float(i[2]) for i in inputList[1:]])  #third column
    xformedlist=[]                                      #init empty list
    xformedlist.append(inputList[0])
    for i,j,k in zip(col0,col1,col2):
        xformedlist.append(tuple((i,j,k)))              #append each element from columns to the init list
    return xformedlist

def generateplots(inputList):
    """
    This function takes in other functions and dataframe to plot different graphs with and without transformation.
    Clears plot to prevent overlapping with the previous plot.
    
    Input (list): takes list of tuples.
            Example: a=[(1,2,3),(4,5,6),(7,8,9)]
    Output (.png): save .png for each plot
    """
    
    #No transform
    plt.plot([float(i[1]) for i in inputList[1:]])
    plt.plot([float(i[2]) for i in inputList[1:]])
    plt.savefig("baseplot.png")
    plt.clf()
    
    #log transform
    plt.plot(logxform([float(i[1]) for i in inputList[1:]]))
    plt.plot(logxform([float(i[2]) for i in inputList[1:]]))
    plt.savefig("logxform.png")
    plt.clf()
    
    #mean reduction transform
    plt.plot(meanred([float(i[1]) for i in inputList[1:]]))
    plt.plot(meanred([float(i[2]) for i in inputList[1:]]))
    plt.savefig("meanreduction.png")
    plt.clf()
    
    #min max reduction transform
    plt.plot(minmaxred([float(i[1]) for i in inputList[1:]]))
    plt.plot(minmaxred([float(i[2]) for i in inputList[1:]]))
    plt.savefig("minmaxreduction.png")
    plt.clf()

In [9]:
test=applymeanred(df)

In [10]:
generateplots(df)

<Figure size 432x288 with 0 Axes>