In [25]:
import pandas as pd
import numpy as np
import os

from sklearn import svm
from sklearn.linear_model import LinearRegression

Creating the Dataframes of Camera Data

In [26]:
#Set directories for csvs of pose and activity data
actDir = "act_csvs"
camDir = "cam_csvs"
#set the range in seconds where to label frames as interacting
range = 30

In [27]:
#write all filenames of pose csvs into an array
filenames = []
directory = os.fsencode(camDir)
for file in os.listdir(directory):
     filename = os.fsdecode(file)
     if filename.endswith(".csv"):
         filenames.append(filename)

print(filenames)
print(len(filenames))

['2023-03-27T12-36-56.345.csv', '2023-03-31T16-01-24.272.csv', '2023-03-17T11-51-07.683.csv', '2023-03-13T00-36-05.276.csv', '2023-03-09T12-38-23.976.csv', '2023-03-23T10-18-12.982.csv', '2023-03-23T13-38-52.315.csv', '2023-03-23T07-37-16.244.csv', '2023-03-28T15-22-56.875.csv', '2023-03-09T07-44-52.754.csv', '2023-03-28T07-23-54.575.csv', '2023-03-20T13-05-09.221.csv', '2023-03-22T04-55-56.632.csv', '2023-03-01T10-34-05.930.csv', '2023-03-29T11-07-25.692.csv', '2023-03-28T14-34-56.688.csv', '2023-03-08T07-33-16.400.csv', '2023-03-28T06-57-05.629.csv', '2023-03-11T17-11-34.353.csv', '2023-03-28T04-48-26.948.csv', '2023-03-30T10-31-14.737.csv', '2023-03-30T05-09-36.174.csv', '2023-03-14T08-38-20.035.csv', '2023-03-30T10-21-46.778.csv', '2023-03-23T18-13-34.315.csv', '2023-03-19T17-09-19.273.csv', '2023-03-28T09-39-48.986.csv', '2023-03-21T11-37-34.787.csv', '2023-03-20T10-24-10.181.csv', '2023-03-02T13-21-08.706.csv', '2023-03-28T13-40-57.551.csv', '2023-03-07T08-18-44.199.csv', '2023-0

In [28]:
#extract the pose csv data into an array of dataframes
datasets = []
countempty = 0
for file in filenames:
    filestring = "./" + camDir + "/" + file
    df = pd.read_csv(filestring)
    if not df.empty:
        datasets.append(pd.read_csv(filestring))
    else:
        countempty += 1

print('dataframes: ' + str(len(datasets)))
print('empty: ' + str(countempty))
datasets[0].head()

dataframes: 4347
empty: 4790


Unnamed: 0,time,gro1,gro2,gro3,gro4,v1,v2,v3,k0x,k0y,...,k30z,k31x,k31y,k31z,k32x,k32y,k32z,k33x,k33y,k33z
0,"2023, 3, 27, 12, 36, 56, 345000",-0.04086,0.06853,0.02914,0.99639,0.0,0.0,0.0,-2.87311,1.07641,...,2.51401,-2.74403,0.29736,2.41246,-3.01445,2.01651,2.34384,-2.81666,2.02701,2.31605
1,"2023, 3, 27, 12, 36, 56, 628000",-0.03755,-0.12673,0.04727,0.9901,0.30759,0.009,0.55558,-2.86248,1.01433,...,2.78723,-2.68455,0.30574,2.70248,-2.98753,1.94945,2.53108,-2.80477,1.96902,2.57802
2,"2023, 3, 27, 12, 36, 56, 878000",0.02186,-0.08084,0.02318,0.99622,-0.10668,0.03785,1.06937,-2.99292,1.00219,...,3.14777,-2.86948,0.30428,3.06341,-2.95535,1.91865,3.19235,-2.83838,1.93412,3.2314
3,"2023, 3, 27, 12, 36, 57, 112000",-0.00407,-0.02937,0.01156,0.99949,-0.16979,0.04376,1.18522,-3.04484,0.98282,...,3.48804,-2.95345,0.27299,3.39688,-3.16118,1.90233,3.33504,-2.81333,1.88617,3.50001
4,"2023, 3, 27, 12, 36, 57, 345000",0.01406,-0.02888,0.04719,0.99837,-0.12861,0.0437,1.23544,-3.04811,0.99091,...,3.78979,-2.90685,0.29106,3.70057,-3.2452,1.88739,3.7437,-3.03402,1.87379,4.01315


In [29]:
cnt = 0
for data in datasets:
    cnt += len(data)

print(cnt)

220986


Creating the Ground Truth and preparing Machine Learning

In [30]:
truths = []
ones = 0
zeros = 0
counter = 0
#iterate through all dataframes
for data in datasets:
    #extract the first column which contains the time
    dates = data.loc[:,"time"]
    #if date is unreadable, set it to an impossible date
    try:
        date = dates[0].replace(" ", "").split(',')[0:3]
        year = date[0]
        month = date[1]
        if len(month) < 2:
            month = '0' + month
        day = date[2]
        if len(day) < 2:
            day = '0' + day
    except:
        year = 0
        month = 0
        day = 0
        pass

    #create the string of the associated activity csv from the extracted time
    filestring = './' + actDir + '/activities-' + year + '-' + month + '-' + day + '.csv'

    labels = []
    #open the corresponding file if it exists
    try:
        timestamps = pd.read_csv(filestring)
    except:
        timestamps = pd.DataFrame()
        pass

    #iterate through all dates in the time column of the dataframe
    for date in dates:
        #extract the time of day in seconds of a given frame
        try:
            time = date.replace(" ", "").split(',')[3:6]
            hour = int(time[0])
            minute = int(time[1])
            second = int(time[2])
            total = hour * 3600 + minute * 60 + second
        except:
            #if time is unreadable, set it outside of possible range
            counter += 1
            total = 100000
            pass

        #set the label to 1 if the time is within the specified range (in seconds) of an activity, else leave it at 0
        found = 0
        for timestamp in timestamps:
            if found == 0:
                ts = timestamp.split('T')[1].split('.')[0]
                ts_h = int(ts.split(':')[0])
                ts_m = int(ts.split(':')[1])
                ts_s = int(ts.split(':')[2])
                ts_total = ts_h * 3600 + ts_m * 60 + ts_s

                if ts_total + range > total > ts_total - range:
                    found = 1
                    labels.append(1)
                    ones += 1

        if found == 0:
            labels.append(0)
            zeros += 1

    #create a list of lists that contains the labels to each dataframe
    truths.append(labels)

print(truths)
print('nr of 0s: ' + str(zeros))
print('nr of 1s: ' + str(ones))
print('nr of unreadable dates (set to 0): ' + str(counter))

[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [38]:
for i,list in enumerate(truths): 
    summe = sum(list)
    if filenames[i] == '2023-03-24T08-33-34.364.csv':
        print(i)
     

3916


In [32]:
#concatenate all dataframes to one single dataframe
megatable = datasets[0]
for data in datasets[1:]:
    megatable = pd.concat([megatable, data], ignore_index=True)

#concatenate all the label lists to one big list that corresponds row for row with the dataframe
labels = []
for truth in truths:
    labels += truth

print(megatable)
print(labels)
megatable.info()
print('length of y: ' + str(len(labels)))

                                    time     gro1     gro2     gro3     gro4   
0        2023, 3, 27, 12, 36, 56, 345000 -0.04086  0.06853  0.02914  0.99639  \
1        2023, 3, 27, 12, 36, 56, 628000 -0.03755 -0.12673  0.04727  0.99010   
2        2023, 3, 27, 12, 36, 56, 878000  0.02186 -0.08084  0.02318  0.99622   
3        2023, 3, 27, 12, 36, 57, 112000 -0.00407 -0.02937  0.01156  0.99949   
4        2023, 3, 27, 12, 36, 57, 345000  0.01406 -0.02888  0.04719  0.99837   
...                                  ...      ...      ...      ...      ...   
220981      2023, 3, 9, 5, 54, 46, 29000 -0.02069 -0.13441  0.02641  0.99242   
220982     2023, 3, 9, 5, 54, 46, 295000 -0.01945 -0.08759  0.02577  0.99768   
220983     2023, 3, 9, 5, 54, 46, 545000 -0.01843 -0.04443  0.02492  1.00057   
220984     2023, 3, 9, 5, 54, 46, 812000 -0.01749  0.00230  0.02407  1.00159   
220985      2023, 3, 9, 5, 54, 47, 29000 -0.01679  0.03827  0.02343  1.00089   

             v1       v2       v3      

Machine Learning

In [33]:
#split the time column from the megatable
if 'time' in megatable:
    dates = megatable.loc[:,"time"]
    del megatable['time']

#convert megatable and labels into numpy arrays X and y
X = megatable.to_numpy()
y = np.asarray(labels)
print(X.shape)
print(y.shape)

(220986, 109)
(220986,)


In [37]:
# Define model
model = LinearRegression()
# Fit model
save = model.fit(X, y)

<bound method RegressorMixin.score of LinearRegression()>

In [63]:
df_test = datasets[3916]
#del df_test['time']
X_test = df_test.to_numpy()

erg = save.predict(X_test)
hilfsnum = 0 

for num in erg:
    if num > hilfsnum:
        hilfsnum = num

for i, num in enumerate(erg): 
    erg[i] = num / hilfsnum

for num in erg:
    if num > hilfsnum:
        hilfsnum = num

print(erg)
print(hilfsnum)



[ 0.21034037 -0.12813251 -0.05448659 -0.09634995  0.7420223   0.72952984
  0.76986389  0.71333514  0.7051316   0.73043901  0.71061998  0.75280524
  0.73340578  0.70104761  0.67426515  0.61655906  0.71011535  0.70032286
  0.25535968  0.18645033  0.02408801 -0.03367513  0.01639069  0.01389623
 -0.02599066  0.05205301  0.10872453  0.16261319  0.16250413  0.2029248
  0.17705945  0.10888672  0.15634966  0.06654603  0.15002855  0.12408154
  0.21287053  0.12542282  0.06858042  0.02910648  0.04524152  0.04212584
 -0.00987315  0.02123417  0.04379299  0.02466143 -0.03792923  0.04046995
 -0.06188156 -0.08229832 -0.02497696 -0.06929742 -0.08283399 -0.03275161
 -0.07691839  0.00463081  0.03604221  0.04713599  0.00434502  0.0045496
 -0.0037566   0.0275096   0.02219485  0.11589457  0.21258682  0.14145258
  0.20503801  0.28814502  0.30642314  0.11024869 -0.08410484  0.05538105
  0.15820766  0.29380899  0.37478827  0.35410147  0.37877813  0.30367023
  0.31489188  0.2413004   0.13117103  0.08910481  0.1