# Analysis of Single-Point Test Data

## Reading in Data

### Import Code

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
import statistics

In [None]:
# Prototype of the load-and-merge steps; reads the three run workbooks and
# builds one table with a single row per frequency.
df1 = pd.read_excel("FM Radio Signal Strength run 1-5.xlsx")
df2 = pd.read_excel("FM Radio Signal Strength run 6-10.xlsx")
df2edit = df2.drop('Frequency', axis=1)

df3 = pd.read_excel("FM Radio Signal Strength run 11-20.xlsx")
df3edit = df3.drop('Frequency', axis=1)

# Only the first workbook keeps its Frequency column; the frames are joined
# side by side on row position.
# NOTE(review): this assumes every workbook lists its rows in the same
# frequency order - confirm against the spreadsheets.
data_frames = [df1, df2edit, df3edit]
all_data_unsorted = pd.concat(data_frames, axis=1)
test_df_unsorted = all_data_unsorted.copy()

# min_count=1 leaves a cell as NaN when a frequency has no reading for a run.
# This replaces the earlier "fill NaN with 0, sum, turn 0 back into NaN"
# round trip, whose fillna() result was discarded and which would also have
# erased genuine 0.0 readings.
final_test_df = test_df_unsorted.groupby("Frequency").sum(min_count=1)
cols = final_test_df.columns
all_data = final_test_df
all_data.head(25)

### Import Functions

In [None]:
def read_In_Files_Excel(fileList):
    '''Reads every Excel file in fileList and joins them side by side.

    fileList is a list of filenames (strings). The first column of the
    first file is treated as the shared key column: it is kept from the
    first file and dropped from every later file before the frames are
    concatenated column-wise. A single-element list returns that file's
    dataframe unchanged.'''
    first_frame = pd.read_excel(fileList[0])
    key_column = first_frame.columns[0]

    # nothing to merge when only one file was supplied
    if len(fileList) == 1:
        return first_frame

    frames = [first_frame]
    for file_name in fileList[1:]:
        extra_frame = pd.read_excel(file_name)
        frames.append(extra_frame.drop(key_column, axis=1))

    return pd.concat(frames, axis=1)

#all_data_unsorted = read_In_Files_Excel(["FM Radio Signal Strength run 1-5.xlsx", "FM Radio Signal Strength run 6-10.xlsx", "FM Radio Signal Strength run 11-20.xlsx"])
#all_data_unsorted.head(25)

In [None]:
def clean_dataframe(data):
    '''Returns a dataframe with one row per value of the first column.

    The first column of data (the frequency column in this notebook) is
    used as the grouping key and becomes the index of the result. Rows
    that share a key are merged by summing each remaining column, so
    readings spread over several rows end up on one line.

    A cell stays NaN when none of the merged rows has a reading for it
    (min_count=1). The input dataframe is not modified.'''
    joint_col = data.columns[0]
    # Grouping by the column name removes the key column from the result,
    # so no hard-coded 'Frequency' drop is needed. min_count=1 keeps
    # "no data" as NaN without using 0.0 as a sentinel, which would erase
    # genuine 0.0 readings.
    return data.groupby(joint_col).sum(min_count=1)

#all_data = clean_dataframe(all_data_unsorted)
#all_data.head(25)

In [None]:
def read_and_clean_data(fileList):
    '''Loads the Excel files named in fileList and returns the
    cleaned table: the files are combined with read_In_Files_Excel
    and the result is passed through clean_dataframe'''
    return clean_dataframe(read_In_Files_Excel(fileList))

# Workbooks holding runs 1-5, 6-10 and 11-20. The merged and cleaned table
# is stored in all_data, which the later cells read.
file_List = ["FM Radio Signal Strength run 1-5.xlsx", "FM Radio Signal Strength run 6-10.xlsx", "FM Radio Signal Strength run 11-20.xlsx"]
all_data = read_and_clean_data(file_List)
# display the first 26 rows as the cell output
all_data.head(26)

## Data Exploration

### Data Analysis Code

In [None]:
# strongest signal per run
# Each entry of strong_List is [index label of the strongest reading,
# strongest reading] for one column (run) of all_data, in column order.
strong_List = []
df_dim = all_data.shape

for _, run_signals in all_data.items():
    if run_signals.notna().any():
        # idxmax/max skip NaN and need no starting sentinel, so runs whose
        # readings are all <= 0 are reported correctly (a 0 seed would
        # have returned [0, 0] for them)
        strong_List.append([run_signals.idxmax(), run_signals.max()])
    else:
        # a run without any reading has no strongest signal
        strong_List.append([None, np.nan])

print(strong_List)

In [None]:
# Largest "Power" value for each "Frequency" in line_data.
# NOTE(review): line_data is not defined in the cells shown above - confirm
# it is loaded before this cell runs.
line_data.groupby("Frequency")["Power"].max()

In [None]:
# weakest signal per run
# Each entry of weak_List is [index label of the weakest reading,
# weakest reading] for one column (run) of all_data, in column order.
weak_List = []
df_dim = all_data.shape

for _, run_signals in all_data.items():
    if run_signals.notna().any():
        # idxmin/min skip NaN and need no starting sentinel, so the result
        # is right even when every reading is >= 110 (the former seed).
        # When several rows tie for the minimum, the first one is reported.
        weak_List.append([run_signals.idxmin(), run_signals.min()])
    else:
        # a run without any reading has no weakest signal
        weak_List.append([None, np.nan])

print(weak_List)

In [None]:
# Smallest "Power" value for each "Frequency" in line_data.
# NOTE(review): line_data is not defined in the cells shown above - confirm
# it is loaded before this cell runs.
line_data.groupby("Frequency")["Power"].min() #add/based on conditions, ex. max for conditions 1-4

In [None]:
# average signal strength
# avg_List holds the mean of the non-NaN readings of every column (run) of
# all_data, in column order.
df_dim = all_data.shape

# DataFrame.mean skips NaN by default and yields NaN for a run that has no
# readings at all, where a manual sum/count would divide by zero.
avg_List = all_data.mean().tolist()

print(avg_List)

In [None]:
# Mean "Power" value for each "Frequency" in line_data.
# NOTE(review): line_data is not defined in the cells shown above - confirm
# it is loaded before this cell runs.
line_data.groupby("Frequency")["Power"].mean()

In [None]:
# best run per frequency
# freqList holds every frequency label of all_data once; valueList holds
# the largest reading found for that frequency across all runs.
values_df = all_data.copy()
# row-wise maximum over the runs, ignoring NaN
freq_max_list = values_df.max(axis=1, skipna=True)
row_List = list(all_data.index)

# Collapse repeated frequency labels to their largest row maximum, keeping
# first-seen order (sort=False). This replaces a loop that iterated over
# len(data) - a name not defined at this point - and that removed entries
# from valueList by value, which could delete another frequency's entry.
best_per_freq = freq_max_list.groupby(level=0, sort=False).max()
freqList = list(best_per_freq.index)
valueList = list(best_per_freq)

print(freqList)
print(valueList)

### Data Analysis Functions

In [None]:
def run_strongest_signal(data, run_num):
    '''Prints the strongest signal power of one run.

    data is a dataframe with one column per run; run_num is the
    1-based position of the run's column. If run_num is smaller
    than 1 or larger than the number of columns, a notice that the
    run is not in the data is printed instead. NaN readings are
    ignored. Always returns None.'''
    # Read the run from the dataframe that was passed in (not from the
    # notebook-level all_data) and reject run numbers below 1, which would
    # otherwise wrap around to columns counted from the end.
    if not 1 <= run_num <= data.shape[1]:
        print('This run is not in the data.')
        return None

    strongest = data.iloc[:, run_num - 1].max()
    print('The maximum signal power for this run is:', strongest, 'dBm.')
    return None

# Example: strongest signal of run 10 in the combined data.
run_strongest_signal(all_data, 10)

In [None]:
def run_weakest_signal(data, run_num):
    '''Prints the weakest signal power of one run.

    data is a dataframe with one column per run; run_num is the
    1-based position of the run's column. If run_num is smaller
    than 1 or larger than the number of columns, a notice that the
    run is not in the data is printed instead. NaN readings are
    ignored. Always returns None.'''
    # Read the run from the dataframe that was passed in (not from the
    # notebook-level all_data) and reject run numbers below 1, which would
    # otherwise wrap around to columns counted from the end.
    if not 1 <= run_num <= data.shape[1]:
        print('This run is not in the data.')
        return None

    # Series.min needs no starting sentinel, so readings above 110 are
    # handled as well.
    weakest = data.iloc[:, run_num - 1].min()
    print('The minimum signal power for this run is:', weakest, 'dBm.')
    return None

# Example: weakest signal of run 6 in the combined data.
run_weakest_signal(all_data, 6)

In [None]:
def run_avg_signal(data, run_num):
    '''Prints the average signal power of one run.

    data is a dataframe with one column per run; run_num is the
    1-based position of the run's column. If run_num is smaller
    than 1 or larger than the number of columns, a notice that the
    run is not in the data is printed instead. NaN readings are
    left out of the average; a run without any reading prints nan.
    Always returns None.'''
    # Read the run from the dataframe that was passed in (not from the
    # notebook-level all_data) and reject run numbers below 1, which would
    # otherwise wrap around to columns counted from the end.
    if not 1 <= run_num <= data.shape[1]:
        print('This run is not in the data.')
        return None

    # Only the requested column is averaged, so an empty run elsewhere in
    # the table can no longer cause a division by zero.
    average = data.iloc[:, run_num - 1].mean()
    print('The average signal power for this run is:', average, 'dBm.')
    return None
    
# Example: average signal of run 11 in the combined data.
run_avg_signal(all_data, 11)

In [None]:
# find maximum signal for the provided frequency in data
def max_signal_per_freq(data, frequency):
    '''Prints the maximum signal power for the input frequency
    if the frequency is present and a notice of absence if not.

    data is a dataframe indexed by frequency with one column per
    run. The maximum is taken over all runs, ignoring NaN; if a
    frequency label occurs on several rows the largest of their
    maxima is used. Always returns None.'''
    # largest reading on every row, across the run columns
    row_max = data.max(axis=1, skipna=True)
    # Merge rows that share a frequency label. The labels come from data
    # itself, not from the notebook-level all_data.
    best_per_freq = row_max.groupby(level=0).max()

    if frequency in best_per_freq.index:
        print('The maximum signal power for this frequency is:', best_per_freq.loc[frequency], 'dBm.')
    else:
        print('This frequency is not in the data.')
    return None
    
# Example: maximum signal recorded at frequency 107.8 in the combined data.
max_signal_per_freq(all_data, 107.8)

In [None]:
def line_data_max_signal(data, frequency):
    '''Prints the largest "Power" value recorded for frequency.

    data must have "Frequency" and "Power" columns. When the
    frequency does not occur in data a notice is printed instead.
    Returns None.'''
    power_by_freq = data.groupby("Frequency")["Power"].max()
    # membership on a Series tests its index, i.e. the frequencies
    if frequency not in power_by_freq:
        return print('This frequency is not in the data.')
    return print('The maximum signal power for this frequency is:', power_by_freq[frequency], 'dBm.')
    
# Example: maximum power at frequency 99.5.
# NOTE(review): line_data is not defined in the cells shown above - confirm
# it is loaded before this cell runs.
line_data_max_signal(line_data, 99.5)

In [None]:
def line_data_min_signal(data, frequency):
    '''Prints the smallest "Power" value recorded for frequency.

    data must have "Frequency" and "Power" columns. When the
    frequency does not occur in data a notice is printed instead.
    Returns None.'''
    power_by_freq = data.groupby("Frequency")["Power"].min()
    # membership on a Series tests its index, i.e. the frequencies
    if frequency not in power_by_freq:
        return print('This frequency is not in the data.')
    return print('The minimum signal power for this frequency is:', power_by_freq[frequency], 'dBm.')
    
# Example: minimum power at frequency 99.5.
# NOTE(review): line_data is not defined in the cells shown above - confirm
# it is loaded before this cell runs.
line_data_min_signal(line_data, 99.5)

In [None]:
def line_data_mean_signal(data, frequency):
    '''Prints the mean "Power" value recorded for frequency.

    data must have "Frequency" and "Power" columns. When the
    frequency does not occur in data a notice is printed instead.
    Returns None.'''
    power_by_freq = data.groupby("Frequency")["Power"].mean()
    # membership on a Series tests its index, i.e. the frequencies
    if frequency not in power_by_freq:
        return print('This frequency is not in the data.')
    return print('The mean signal power for this frequency is:', power_by_freq[frequency], 'dBm.')
    
# Example: mean power at frequency 99.5.
# NOTE(review): line_data is not defined in the cells shown above - confirm
# it is loaded before this cell runs.
line_data_mean_signal(line_data, 99.5)

### Graphing Data

In [None]:
# code put data in a form for graphing without gaps in it
def plotData(data):
    '''Returns a copy of the input dataframe with one extra
    boolean column per run indicating which cells can be graphed.

    For every column of data a flag column is appended that is
    True where the reading is finite (not NaN or infinite). The
    flag column is named after the run column with the spaces
    removed, lower-cased and suffixed with "use", e.g. "Run 1"
    -> "run1use" and " Run 8" -> "run8use". The input dataframe is
    not modified.'''
    plot_data = data.copy()

    # Build the masks from the dataframe that was passed in (not from the
    # notebook-level all_data) and for however many run columns it has,
    # instead of exactly twenty hard-coded names and insert positions.
    for column in data.columns:
        flag_name = str(column).replace(' ', '').lower() + 'use'
        plot_data[flag_name] = np.isfinite(data[column])

    return plot_data

In [None]:
# dist 5 miles, 10% humidity plot

def plot_runs_1to5(data):
    '''Plots the first data set of the dataframe (runs 1-5,
    10% humidity, 5 miles) as a scatter of signal strength
    against frequency, one series per run. NaN readings are
    left out using the flag columns added by plotData.'''
    run_numbers = [1, 2, 3, 4, 5]
    run_List = ['Run 1', 'Run 2', 'Run 3', 'Run 4', 'Run 5']
    fig, ax = plt.subplots()
    fig.set_size_inches(18.5, 10.5)
    plot_data = plotData(data)
    for run_number, run_name in zip(run_numbers, run_List):
        # "run<N>use" marks the rows where run N has a usable reading
        usable = plot_data['run{}use'.format(run_number)]
        ax.scatter(data.index[usable], plot_data[run_name][usable])
    # FM broadcast frequencies such as 99.5 and 107.8 are MHz values
    ax.set_xlabel('Frequency (MHz)')
    ax.set_ylabel('Signal Strength (dBm)')
    fig.suptitle('Frequency vs Signal Strength (10% Humidity, 5 miles)')
    plt.legend(run_List)
    plt.show()


plot_runs_1to5(all_data)

In [None]:
# dist 10 miles, 10% humidity plot

def plot_runs_6to10(data):
    '''Plots the second data set of the dataframe (runs 6-10,
    10% humidity, 10 miles) as a scatter of signal strength
    against frequency, one series per run. NaN readings are
    left out using the flag columns added by plotData.'''
    run_numbers = [6, 7, 8, 9, 10]
    # the run 8 column label carries a leading space in the data
    column_names = ['Run 6', 'Run 7', ' Run 8', 'Run 9', 'Run 10']
    run_List = ['Run 6', 'Run 7', 'Run 8', 'Run 9', 'Run 10']
    fig, ax = plt.subplots()
    fig.set_size_inches(18.5, 10.5)
    plot_data = plotData(data)
    for run_number, column_name in zip(run_numbers, column_names):
        # "run<N>use" marks the rows where run N has a usable reading
        usable = plot_data['run{}use'.format(run_number)]
        ax.scatter(data.index[usable], plot_data[column_name][usable])
    # FM broadcast frequencies such as 99.5 and 107.8 are MHz values
    ax.set_xlabel('Frequency (MHz)')
    ax.set_ylabel('Signal Strength (dBm)')
    fig.suptitle('Frequency vs Signal Strength (10% Humidity, 10 miles)')
    plt.legend(run_List)
    plt.show()


#plot_runs_6to10(all_data)

In [None]:
# dist 5 miles, 90% humidity plot

def plot_runs_11to15(data):
    '''Plots the third data set of the dataframe (runs 11-15,
    90% humidity, 5 miles) as a scatter of signal strength
    against frequency, one series per run. NaN readings are
    left out using the flag columns added by plotData.'''
    run_numbers = [11, 12, 13, 14, 15]
    run_List = ['Run 11', 'Run 12', 'Run 13', 'Run 14', 'Run 15']
    fig, ax = plt.subplots()
    fig.set_size_inches(18.5, 10.5)
    plot_data = plotData(data)
    for run_number, run_name in zip(run_numbers, run_List):
        # "run<N>use" marks the rows where run N has a usable reading
        usable = plot_data['run{}use'.format(run_number)]
        ax.scatter(data.index[usable], plot_data[run_name][usable])
    # FM broadcast frequencies such as 99.5 and 107.8 are MHz values
    ax.set_xlabel('Frequency (MHz)')
    ax.set_ylabel('Signal Strength (dBm)')
    fig.suptitle('Frequency vs Signal Strength (90% Humidity, 5 miles)')
    plt.legend(run_List)
    plt.show()


#plot_runs_11to15(all_data)

In [None]:
# dist 10 miles, 90% humidity plot

def plot_runs_16to20(data):
    '''Plots the fourth data set of the dataframe (runs 16-20,
    90% humidity, 10 miles) as a scatter of signal strength
    against frequency, one series per run. NaN readings are
    left out using the flag columns added by plotData.'''
    run_List = ['Run 16', 'Run 17', 'Run 18', 'Run 19', 'Run 20']
    runs_List = ['run16use', 'run17use', 'run18use', 'run19use', 'run20use']
    fig, ax = plt.subplots()
    fig.set_size_inches(18.5, 10.5)
    plot_data = plotData(data)
    for run_name, flag_name in zip(run_List, runs_List):
        # the flag column marks the rows where the run has a usable reading
        usable = plot_data[flag_name]
        ax.scatter(data.index[usable], plot_data[run_name][usable])
    # FM broadcast frequencies such as 99.5 and 107.8 are MHz values
    ax.set_xlabel('Frequency (MHz)')
    ax.set_ylabel('Signal Strength (dBm)')
    fig.suptitle('Frequency vs Signal Strength (90% Humidity, 10 miles)')
    plt.legend(run_List)
    plt.show()


#plot_runs_16to20(all_data)

In [None]:
def plot_by_groups(data):
    '''Draws one figure per data set (runs 1-5, 6-10,
    11-15 and 16-20), in that order, from the same dataframe'''
    group_plotters = (plot_runs_1to5, plot_runs_6to10,
                      plot_runs_11to15, plot_runs_16to20)
    for draw_group in group_plotters:
        draw_group(data)

plot_by_groups(all_data)

In [None]:
def plot_runs_1to20(data):
    '''Plots all the data on one graph.

    Every run is drawn as its own scatter series of signal
    strength against frequency. Runs are coloured by the set of
    five they belong to: 1-5 blue, 6-10 green, 11-15 red and
    16-20 cyan. NaN readings are left out using the flag columns
    added by plotData.'''
    run_List = ['Run {}'.format(number) for number in range(1, 21)]
    group_colors = ['b', 'g', 'r', 'c']
    fig, ax = plt.subplots()
    fig.set_size_inches(18.5, 10.5)
    plot_data = plotData(data)
    for position, run_name in enumerate(run_List):
        run_number = position + 1
        # the run 8 column label carries a leading space in the data
        column_name = ' Run 8' if run_number == 8 else run_name
        # "run<N>use" marks the rows where run N has a usable reading
        usable = plot_data['run{}use'.format(run_number)]
        ax.scatter(data.index[usable], plot_data[column_name][usable],
                   color=group_colors[position // 5])
    # FM broadcast frequencies such as 99.5 and 107.8 are MHz values
    ax.set_xlabel('Frequency (MHz)')
    ax.set_ylabel('Signal Strength (dBm)')
    fig.suptitle('Frequency vs Signal Strength (All Runs)')
    plt.legend(run_List, ncol=4)
    plt.show()

# dashes and dots
plot_runs_1to20(all_data)

In [None]:
# plot the first run of each set of five
def plot_first_runs(data):
    '''Plots the first run of each set of five (runs 1, 6, 11
    and 16) as scatter series of signal strength against
    frequency. NaN readings are left out using the flag columns
    added by plotData.'''
    run_numbers = [1, 6, 11, 16]
    run_List = ['Run 1', 'Run 6', 'Run 11', 'Run 16']
    marker_colors = ['b', 'g', 'r', 'm']
    fig, ax = plt.subplots()
    fig.set_size_inches(18.5, 10.5)
    plot_data = plotData(data)
    for run_number, run_name, marker_color in zip(run_numbers, run_List, marker_colors):
        # "run<N>use" marks the rows where run N has a usable reading
        usable = plot_data['run{}use'.format(run_number)]
        ax.scatter(data.index[usable], plot_data[run_name][usable], color=marker_color)
    # FM broadcast frequencies such as 99.5 and 107.8 are MHz values
    ax.set_xlabel('Frequency (MHz)')
    ax.set_ylabel('Signal Strength (dBm)')
    fig.suptitle('Frequency vs Signal Strength (Runs 1, 6, 11, and 16)')
    plt.legend(run_List, ncol=4)
    plt.show()


plot_first_runs(all_data)

In [None]:
# plot the second run of each set of five
def plot_second_runs(data):
    '''Plots the second run of each set of five (runs 2, 7, 12
    and 17) as scatter series of signal strength against
    frequency. NaN readings are left out using the flag columns
    added by plotData.'''
    run_numbers = [2, 7, 12, 17]
    run_List = ['Run 2', 'Run 7', 'Run 12', 'Run 17']
    marker_colors = ['b', 'g', 'r', 'm']
    fig, ax = plt.subplots()
    fig.set_size_inches(18.5, 10.5)
    plot_data = plotData(data)
    for run_number, run_name, marker_color in zip(run_numbers, run_List, marker_colors):
        # "run<N>use" marks the rows where run N has a usable reading
        usable = plot_data['run{}use'.format(run_number)]
        ax.scatter(data.index[usable], plot_data[run_name][usable], color=marker_color)
    # FM broadcast frequencies such as 99.5 and 107.8 are MHz values
    ax.set_xlabel('Frequency (MHz)')
    ax.set_ylabel('Signal Strength (dBm)')
    fig.suptitle('Frequency vs Signal Strength (Runs 2, 7, 12, and 17)')
    plt.legend(run_List, ncol=4)
    plt.show()


plot_second_runs(all_data)

In [None]:
# plot the third run of each set of five
def plot_third_runs(data):
    '''Plots the third run of each set of five (runs 3, 8, 13
    and 18) as scatter series of signal strength against
    frequency. NaN readings are left out using the flag columns
    added by plotData.'''
    run_numbers = [3, 8, 13, 18]
    # the run 8 column label carries a leading space in the data
    column_names = ['Run 3', ' Run 8', 'Run 13', 'Run 18']
    run_List = ['Run 3', 'Run 8', 'Run 13', 'Run 18']
    marker_colors = ['b', 'g', 'r', 'm']
    fig, ax = plt.subplots()
    fig.set_size_inches(18.5, 10.5)
    plot_data = plotData(data)
    for run_number, column_name, marker_color in zip(run_numbers, column_names, marker_colors):
        # "run<N>use" marks the rows where run N has a usable reading
        usable = plot_data['run{}use'.format(run_number)]
        ax.scatter(data.index[usable], plot_data[column_name][usable], color=marker_color)
    # FM broadcast frequencies such as 99.5 and 107.8 are MHz values
    ax.set_xlabel('Frequency (MHz)')
    ax.set_ylabel('Signal Strength (dBm)')
    fig.suptitle('Frequency vs Signal Strength (Runs 3, 8, 13, and 18)')
    plt.legend(run_List, ncol=4)
    plt.show()


plot_third_runs(all_data)

In [None]:
# plot the fourth run of each set of five
def plot_fourth_runs(data):
    '''Plots the fourth run of each set of five (runs 4, 9, 14
    and 19) as scatter series of signal strength against
    frequency. NaN readings are left out using the flag columns
    added by plotData.'''
    run_numbers = [4, 9, 14, 19]
    run_List = ['Run 4', 'Run 9', 'Run 14', 'Run 19']
    marker_colors = ['b', 'g', 'r', 'm']
    fig, ax = plt.subplots()
    fig.set_size_inches(18.5, 10.5)
    plot_data = plotData(data)
    for run_number, run_name, marker_color in zip(run_numbers, run_List, marker_colors):
        # "run<N>use" marks the rows where run N has a usable reading
        usable = plot_data['run{}use'.format(run_number)]
        ax.scatter(data.index[usable], plot_data[run_name][usable], color=marker_color)
    # FM broadcast frequencies such as 99.5 and 107.8 are MHz values
    ax.set_xlabel('Frequency (MHz)')
    ax.set_ylabel('Signal Strength (dBm)')
    fig.suptitle('Frequency vs Signal Strength (Runs 4, 9, 14, and 19)')
    plt.legend(run_List, ncol=4)
    plt.show()


plot_fourth_runs(all_data)

In [None]:
# plot the fifth run of each set of five
def plot_fifth_runs(data):
    '''Plots the fifth run of each set of five (runs 5, 10, 15
    and 20) as scatter series of signal strength against
    frequency. NaN readings are left out using the flag columns
    added by plotData.'''
    run_numbers = [5, 10, 15, 20]
    run_List = ['Run 5', 'Run 10', 'Run 15', 'Run 20']
    marker_colors = ['b', 'g', 'r', 'm']
    fig, ax = plt.subplots()
    fig.set_size_inches(18.5, 10.5)
    plot_data = plotData(data)
    for run_number, run_name, marker_color in zip(run_numbers, run_List, marker_colors):
        # "run<N>use" marks the rows where run N has a usable reading
        usable = plot_data['run{}use'.format(run_number)]
        ax.scatter(data.index[usable], plot_data[run_name][usable], color=marker_color)
    # FM broadcast frequencies such as 99.5 and 107.8 are MHz values
    ax.set_xlabel('Frequency (MHz)')
    ax.set_ylabel('Signal Strength (dBm)')
    fig.suptitle('Frequency vs Signal Strength (Runs 5, 10, 15, and 20)')
    plt.legend(run_List, ncol=4)
    plt.show()


plot_fifth_runs(all_data)

In [None]:
# plot using line data
def determine_color_4groups(data, list_groups):
    '''Returns one matplotlib colour code per row of data.

    data must have "Distance" and "Humidity" columns. list_groups
    is a list of up to four [distance, humidity] pairs, ex.
    [[5, 10], [10, 10], [5, 90], [10, 90]]; rows matching the
    first pair get 'b', the second 'g', the third 'r' and the
    fourth 'y'. A row that matches none of the pairs gets 'k' so
    that the returned list always has one entry per row.'''
    group_colors = ['b', 'g', 'r', 'y']
    color_List = []
    # Iterate over the values themselves rather than data.Distance[i], which
    # is a label lookup and fails when the index is not 0..n-1.
    for distance, humidity in zip(data.Distance, data.Humidity):
        row_color = 'k'
        for group, group_color in zip(list_groups, group_colors):
            if (distance == group[0]) and (humidity == group[1]):
                row_color = group_color
                # first matching group wins
                break
        color_List.append(row_color)

    return color_List


In [None]:
import matplotlib.patches as mpatches

def line_data_scatter(data, groups_List, runList, run_Color):
    '''Scatter-plots "Power" against "Frequency" for every row of
    data, coloured by test condition.

    data must have "Frequency", "Power", "Distance" and "Humidity"
    columns. groups_List is the list of [distance, humidity] pairs
    handed to determine_color_4groups, which picks the point
    colours. runList and run_Color are the legend labels and the
    legend patch colours, matched by position.'''
    # The legend is built from patches because all points are drawn by one
    # scatter call. The patch colours come from run_Color and are not
    # checked against the colours determine_color_4groups assigns.
    legendList = []
    for position, run_label in enumerate(runList):
        legendList.append(mpatches.Patch(color=run_Color[position], label=run_label))

    color_List = determine_color_4groups(data, groups_List)
    fig, ax = plt.subplots()
    fig.set_size_inches(18.5, 10.5)
    ax.scatter(data.Frequency, data.Power, color=color_List)
    # FM broadcast frequencies such as 99.5 and 107.8 are MHz values
    ax.set_xlabel('Frequency (MHz)')
    ax.set_ylabel('Signal Strength (dBm)')
    fig.suptitle('Frequency vs Signal Strength (All Runs)')
    plt.legend(handles=legendList)
    plt.show()

# [distance, humidity] of each condition, with its legend label and colour
test_List = [[5, 10], [10, 10], [5,90], [10,90]]
run_List = ['Run 1-5', 'Run 6-10', 'Run 11-15', 'Run 16-20']
run_color = ['b', 'g', 'r', 'y']
line_data_scatter(line_data, test_List, run_List, run_color)

In [None]:
# interaction plots
# distributions
# histogram colors based on which run of the group it is
# One histogram series per run; the five colours repeat for every set of
# five runs, so a colour identifies the position of a run within its set.
fig, ax = plt.subplots()
fig.set_size_inches(18.5, 10.5)
data = all_data.copy()
run_List = ['Run ' + str(run_number) for run_number in range(1, 21)]
color_List = ['b', 'g', 'r', 'c', 'y']*4
# the run 8 column label carries a leading space in the data
ax.hist([data[' Run 8' if run_name == 'Run 8' else run_name] for run_name in run_List],
        bins='auto', color=color_List)
ax.set_xlabel('Signal Power')
ax.set_ylabel('Number of Occurrences')
fig.suptitle('Signal Power Histogram')
plt.legend(run_List, ncol=4)
plt.show()

In [None]:
# histogram colors based on conditions during run
# One histogram series per run; every set of five consecutive runs shares
# a colour, so a colour identifies the conditions of the run.
fig, ax = plt.subplots()
fig.set_size_inches(18.5, 10.5)
data = all_data.copy()
run_List = ['Run ' + str(run_number) for run_number in range(1, 21)]
color_List = ['b']*5 + ['g']*5 + ['r']*5 + ['c']*5
# the run 8 column label carries a leading space in the data
ax.hist([data[' Run 8' if run_name == 'Run 8' else run_name] for run_name in run_List],
        bins='auto', color=color_List)
ax.set_xlabel('Signal Power')
ax.set_ylabel('Number of Occurrences')
fig.suptitle('Signal Power Histogram')
plt.legend(run_List, ncol=4)
plt.show()

In [None]:
def histogram_Plot(data):
    '''Draws a histogram of signal power with one series per
    run (runs 1-20). Each set of five consecutive runs shares a
    colour; the legend uses the column labels of data'''
    # the run 8 column label carries a leading space in the data
    column_names = [' Run 8' if run_number == 8 else 'Run {}'.format(run_number)
                    for run_number in range(1, 21)]
    color_List = [group_color for group_color in ('b', 'g', 'r', 'c') for _ in range(5)]

    fig, ax = plt.subplots()
    fig.set_size_inches(18.5, 10.5)
    ax.hist([data[column_name] for column_name in column_names],
            bins='auto', color=color_List)
    ax.set_xlabel('Signal Power (dBm)')
    ax.set_ylabel('Number of Occurrences')
    fig.suptitle('Signal Power Histogram')
    plt.legend(data.columns, ncol=4)
    plt.show()
histogram_Plot(all_data)
print(all_data.columns)

### Generalized Plotting

In [None]:
def valid_points(data, run_name):
    '''Returns a boolean Series that is True where the run_name
    column of data holds a finite value (not NaN or infinite)'''
    # use the dataframe that was passed in, not the notebook-level all_data
    run_use = np.isfinite(data[run_name])

    return run_use

In [None]:
def valid_points_groupby(data, run_name):
    '''Placeholder: this function was declared without a body.

    TODO: implement. Until then the call fails loudly instead of
    the empty definition being a syntax error.'''
    raise NotImplementedError('valid_points_groupby has not been implemented yet')

In [None]:
def plot_runs_var(data, runList, colorList):
    '''Scatter-plots signal strength against frequency for the
    chosen runs on one figure.

    runList holds the column labels of the runs to draw and
    colorList the colour of each run, matched by position.
    Readings that valid_points reports as unusable are left out.'''
    fig, ax = plt.subplots()
    fig.set_size_inches(18.5, 10.5)

    for position, run_name in enumerate(runList):
        # The mask is requested for the dataframe being plotted and applied
        # directly; no per-run copy of the frame is needed.
        run_use = valid_points(data, run_name)
        ax.scatter(data.index[run_use], data[run_name][run_use], color=colorList[position])

    # FM broadcast frequencies such as 99.5 and 107.8 are MHz values
    ax.set_xlabel('Frequency (MHz)')
    ax.set_ylabel('Signal Strength (dBm)')
    fig.suptitle('Frequency vs Signal Strength')
    plt.legend(runList)
    plt.show()

run_List = ['Run 11', 'Run 5', 'Run 18', 'Run 2', 'Run 7']
color_List = ['b', 'r', 'g', 'c', 'm']

plot_runs_var(all_data, run_List, color_List)

In [None]:
def histogram_Plot_var(data, groupList):
    '''Draws the signal power histograms of several sets of
    five runs on one figure.

    groupList holds the 1-based column position of the first run
    of each set; that column and the four after it are drawn with
    the colours at the same positions of the 20-entry colour list.
    Currently only works for groups of 5'''
    hist_data = data.copy()
    run_List = hist_data.columns
    color_List = [group_color for group_color in ('b', 'g', 'r', 'c') for _ in range(5)]

    fig, ax = plt.subplots()
    fig.set_size_inches(18.5, 10.5)

    for i in groupList:
        # columns i-1 .. i+3 (0-based) form the set that starts at run i
        group_series = [hist_data[run_List[i + offset]] for offset in range(-1, 4)]
        ax.hist(group_series, bins=90, color=color_List[i-1:i+4])

    ax.set_xlabel('Signal Power (dBm)')
    ax.set_ylabel('Number of Occurrences')
    fig.suptitle('Signal Power Histogram')
    plt.legend(run_List, ncol=4)
    plt.show()

group_List = [1, 6, 11, 16]
histogram_Plot_var(all_data, group_List)

In [None]:
def multiple_histogram_Plot_var(data, groupList, run_cond):
    '''Draws one signal power histogram figure per set of five
    runs.

    groupList holds the 1-based column position of the first run
    of each set; run_cond holds the text appended to each figure
    title, matched to groupList by position. Currently only works
    for groups of 5'''
    hist_data = data.copy()
    run_List = hist_data.columns
    color_List = [group_color for group_color in ('b', 'g', 'r', 'c') for _ in range(5)]

    for run_count, i in enumerate(groupList):
        # columns i-1 .. i+3 (0-based) form the set that starts at run i
        group_series = [hist_data[run_List[i + offset]] for offset in range(-1, 4)]
        fig, ax = plt.subplots()
        fig.set_size_inches(18.5, 10.5)
        ax.hist(group_series, bins='auto', color=color_List[i-1:i+4])
        ax.set_xlabel('Signal Power (dBm)')
        ax.set_ylabel('Number of Occurrences')
        fig.suptitle('Signal Power Histogram: ' + run_cond[run_count])
        plt.legend(run_List[i-1:i+4])
        plt.show()

group_List = [1, 6, 11, 16]
runCond = ['5 Miles, 10% Humidity', '10 Miles, 10% Humidity', '5 Miles, 90% Humidity', '10 Miles, 90% Humidity']
multiple_histogram_Plot_var(all_data, group_List, runCond)

In [None]:
# Histogram with one series per column of all_data, however many runs there
# are; every set of five consecutive columns shares a colour.
fig, ax = plt.subplots()
fig.set_size_inches(18.5, 10.5)
data = all_data.copy()
run_List = data.columns

group_colors = ['b', 'g', 'r', 'c']
color_List = [group_colors[(position // 5) % len(group_colors)]
              for position in range(len(run_List))]
# hist expects a sequence of 1-D datasets; a whole dataframe wrapped in a
# list is a 2-D entry and is rejected, so pass the columns one by one
ax.hist([data[run_name] for run_name in run_List], bins='auto', color=color_List)
ax.set_xlabel('Signal Power')
ax.set_ylabel('Number of Occurrences')
fig.suptitle('Signal Power Histogram')
plt.legend(run_List, ncol=4)
plt.show()

### Interaction Plots

In [None]:
# Prototype cell: interaction plot of mean signal strength against distance,
# one line per humidity.

# Per-column totals and per-column counts of non-missing readings.
sum_List = all_data.sum()
count_List = all_data.count()

# Pooled mean of each block of five consecutive run columns
# (positions 0-4, 5-9, 10-14 and 15-19).
# NOTE(review): the integer keys are meant as positions; on a Series indexed
# by run name this relies on pandas' positional fallback - confirm it still
# works on the installed pandas version.
mean_group_1 = (sum_List[0] + sum_List[1] + sum_List[2] + sum_List[3] + sum_List[4])/(count_List[0] + count_List[1] + count_List[2] + count_List[3] + count_List[4])
mean_group_2 = (sum_List[5] + sum_List[6] + sum_List[7] + sum_List[8] + sum_List[9])/(count_List[5] + count_List[6] + count_List[7] + count_List[8] + count_List[9])
mean_group_3 = (sum_List[10] + sum_List[11] + sum_List[12] + sum_List[13] + sum_List[14])/(count_List[10] + count_List[11] + count_List[12] + count_List[13] + count_List[14])
mean_group_4 = (sum_List[15] + sum_List[16] + sum_List[17] + sum_List[18] + sum_List[19])/(count_List[15] + count_List[16] + count_List[17] + count_List[18] + count_List[19])

# x = 0 is labelled 'Near' and x = 1 'Far'; groups 1 and 2 form the
# '10% Humidity' line, groups 3 and 4 the '90% Humidity' line.
xvalues = [0.0,1.0]
line1y = [mean_group_1, mean_group_2]
line2y = [mean_group_3, mean_group_4]
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(xvalues, line1y, label='10% Humidity')
ax.plot(xvalues, line2y, label='90% Humidity')
ax.set_xticks((0, 1.))
ax.set_xticklabels(('Near', 'Far'))
# leave a margin on both sides of the two x positions
ax.set_xbound((-.2, 1.2))
ax.set_xlabel("Distance")
ax.legend()
plt.ylabel('Mean Signal Strength')
plt.title('Signal Power Interaction Plot')
plt.show()

In [None]:
# Prototype cell: interaction plot of mean signal strength against humidity,
# one line per distance. Uses mean_group_1 .. mean_group_4 as assigned by the
# preceding cell, so that cell has to be run first.
xvalues = [0.0,1.0]
# x = 0 is labelled 'Low' and x = 1 'High'; groups 1 and 3 form the
# '5 Miles' line, groups 2 and 4 the '10 Miles' line.
line1y = [mean_group_1, mean_group_3]
line2y = [mean_group_2, mean_group_4]
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(xvalues, line1y, label='5 Miles')
ax.plot(xvalues, line2y, label='10 Miles')
ax.set_xticks((0, 1.))
ax.set_xticklabels(('Low', 'High'))
# leave a margin on both sides of the two x positions
ax.set_xbound((-.2, 1.2))
ax.set_xlabel("Humidity")
ax.legend()
plt.ylabel('Mean Signal Strength')
plt.title('Signal Power Interaction Plot')
plt.show()

In [None]:
def interaction_Plots(data):
    '''Makes interaction plots of humidity and distance.

    data -- dataframe whose first 20 columns are the runs, read as four
            consecutive blocks of five. On the plots the blocks are used
            as (Near, 10% humidity), (Far, 10% humidity),
            (Near, 90% humidity) and (Far, 90% humidity).

    Two figures are shown: mean signal strength against distance with one
    line per humidity, and against humidity with one line per distance.
    Raises IndexError when data has fewer than 20 columns.'''
    column_sums = data.sum()
    column_counts = data.count()

    # Pooled mean of each block: total of the readings divided by the number
    # of non-missing readings. .iloc states that the lookup is positional;
    # plain integer keys on a name-indexed Series are deprecated in pandas.
    group_means = []
    for start in range(0, 20, 5):
        positions = list(range(start, start + 5))
        group_means.append(column_sums.iloc[positions].sum()
                           / column_counts.iloc[positions].sum())
    mean_group_1, mean_group_2, mean_group_3, mean_group_4 = group_means

    def draw(lines, tick_labels, x_label):
        # one figure holding the given lines between the two x positions
        fig = plt.figure()
        ax = fig.add_subplot(111)
        for y_values, line_label in lines:
            ax.plot([0.0, 1.0], y_values, label=line_label)
        ax.set_xticks((0, 1.))
        ax.set_xticklabels(tick_labels)
        # leave a margin on both sides of the two x positions
        ax.set_xbound((-.2, 1.2))
        ax.set_xlabel(x_label)
        ax.legend()
        ax.set_ylabel('Mean Signal Strength')
        ax.set_title('Signal Power Interaction Plot')
        plt.show()

    draw([([mean_group_1, mean_group_2], '10% Humidity'),
          ([mean_group_3, mean_group_4], '90% Humidity')],
         ('Near', 'Far'), "Distance")
    draw([([mean_group_1, mean_group_3], '5 Miles'),
          ([mean_group_2, mean_group_4], '10 Miles')],
         ('Low', 'High'), "Humidity")
    
interaction_Plots(all_data)

In [None]:
def interaction_Plots_var(data, groupLists, labelList):
    '''Makes interaction plots of the groups of data.

    data       -- dataframe with one column per run
    groupLists -- list of 4 lists; each inner list holds the 1-based column
                  positions of the runs that form one group
    labelList  -- 12 strings, in this order:
                  [0] x label of plot 1      [1] y label of both plots
                  [2], [3] line labels of plot 1
                  [4], [5] x tick labels of plot 1
                  [6] title of both plots    [7] x label of plot 2
                  [8], [9] line labels of plot 2
                  [10], [11] x tick labels of plot 2

    Plot 1 joins group 1 to group 2 and group 3 to group 4; plot 2 joins
    group 1 to group 3 and group 2 to group 4.'''
    column_sums = data.sum()
    column_counts = data.count()

    # Pooled mean per group: total of the readings divided by the number of
    # non-missing readings. .iloc states that the lookup is positional; plain
    # integer keys on a name-indexed Series are deprecated in pandas.
    mean_List = []
    for group in groupLists:
        positions = [run_number - 1 for run_number in group]
        mean_List.append(column_sums.iloc[positions].sum()
                         / column_counts.iloc[positions].sum())

    def draw(lines, tick_labels, x_label):
        # one figure holding the given lines between the two x positions
        fig = plt.figure()
        ax = fig.add_subplot(111)
        for y_values, line_label in lines:
            ax.plot([0.0, 1.0], y_values, label=line_label)
        ax.set_xticks((0, 1.))
        ax.set_xticklabels(tick_labels)
        # leave a margin on both sides of the two x positions
        ax.set_xbound((-.2, 1.2))
        ax.set_xlabel(x_label)
        ax.legend()
        ax.set_ylabel(labelList[1])
        ax.set_title(labelList[6])
        plt.show()

    # mean_List should have 4 values in it at this point
    draw([([mean_List[0], mean_List[1]], labelList[2]),
          ([mean_List[2], mean_List[3]], labelList[3])],
         (labelList[4], labelList[5]), labelList[0])
    draw([([mean_List[0], mean_List[2]], labelList[8]),
          ([mean_List[1], mean_List[3]], labelList[9])],
         (labelList[10], labelList[11]), labelList[7])

group_List = [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10], [11, 12, 13, 14, 15], [16, 17, 18, 19, 20]]
label_List = ['Distance', 'Mean Signal Strength', '10% Humidity', '90% Humidity', 'Near', 'Far', 'Signal Power Interaction Plot', 'Humidity', '5 Miles', '10 Miles', 'Low', 'High']

interaction_Plots_var(all_data, group_List, label_List)

In [None]:
def interaction_Plot_Labels():
    '''Asks for every label of the two interaction plots and returns
    the answers as a list, in the order the questions are asked'''
    questions = (
        'What is the x label for the first graph? ',
        'What is the y label for the graphs? ',
        'What does the first line of the first graph represent? ',
        'What does the second line of the first graph represent? ',
        'What is the first x position for the first graph? ',
        'What is the second x position for the first graph? ',
        'What is the title for the plots? ',
        'What is the x label for the second graph? ',
        'What does the first line of the second graph represent? ',
        'What does the second line of the second graph represent? ',
        'What is the first x position for the second graph? ',
        'What is the second x position for the second graph? ',
    )
    return [input(question) for question in questions]

labelList = interaction_Plot_Labels()
print(labelList)

### Confidence Interval

In [None]:
def mean_confidence_interval(data, confidence=0.95):
    '''Returns the half-width of the confidence interval of the mean.

    data       -- 1-D sequence, or 2-D table that is evaluated column by
                  column; NaN entries are ignored
    confidence -- confidence level, 0.95 by default

    The interval itself is mean +/- the returned value. For a table one
    half-width per column is returned.'''
    a = 1.0 * np.array(data)
    # Count only the real observations (per column for a table) so that NaN
    # padding does not inflate the degrees of freedom of the t quantile.
    n = np.count_nonzero(~np.isnan(a), axis=0)
    se = stats.sem(a, nan_policy='omit')
    h = se * stats.t.ppf((1 + confidence) / 2., n - 1)
    return h

confid_int = mean_confidence_interval(all_data)
print(confid_int)

In [None]:
def plot_runs_var_with_confid_int(data, runList, colorList):
    '''Plots data with the respective confidence intervals.

    data      -- dataframe with one column per run; its index is used as
                 the x axis
    runList   -- names of the columns to plot
    colorList -- one colour per entry of runList

    For each run the points selected by valid_points are drawn as a
    scatter with error bars of +/- the value that mean_confidence_interval
    returns for that column.'''
    fig, ax = plt.subplots()
    fig.set_size_inches(18.5, 10.5)
    # Work on the frame that was passed in rather than the notebook-level
    # all_data, so that the function also works for other frames.
    confid_int = mean_confidence_interval(data)
    match_List = list(data.columns)

    for i, run in enumerate(runList):
        plot_data = data.copy()
        plot_data.insert(1, "run_use", valid_points(data, run))
        use = plot_data.run_use
        x_values = plot_data.index[use]
        y_values = plot_data[run][use]
        # confid_int holds one value per column, in column order
        half_width = confid_int[match_List.index(run)]
        # The label is attached to the scatter so that the legend entry is
        # tied to this run instead of to the order in which artists were
        # added (scatter and error-bar artists alternate).
        ax.scatter(x_values, y_values, color=colorList[i], label=run)
        ax.errorbar(x_values, y_values, yerr=half_width, ecolor=colorList[i], linestyle="None")

    ax.set_xlabel('Frequency (kHz)')
    ax.set_ylabel('Signal Strength (dBm)')
    fig.suptitle('Frequency vs Signal Strength')
    ax.legend()
    plt.show()

run_List = ['Run 11', 'Run 5', 'Run 18', 'Run 2', 'Run 7']
color_List = ['b', 'r', 'g', 'c', 'm']

plot_runs_var_with_confid_int(all_data, run_List, color_List)

### One-Way ANOVA code

In [None]:
def ANOVA_oneway_func(data, compare_List):
    '''Prints the outcome of a one-way ANOVA between two runs.

    data         -- dataframe with one column per run
    compare_List -- the two column names to compare (only the first two
                    entries are used)

    Each run is reduced to the points selected by valid_points before the
    test. The null hypothesis is rejected for a p-value below 0.05.
    Nothing is returned.'''
    first, second = compare_List[0], compare_List[1]

    # to not mix in nans; the selection is taken from the frame that was
    # passed in rather than the notebook-level all_data
    use_data = data.copy()
    use_data.insert(1, "run_use1", valid_points(data, first))
    use_data.insert(2, "run_use2", valid_points(data, second))

    f_val, p_val = stats.f_oneway(data[first][use_data.run_use1], data[second][use_data.run_use2])

    # if the p value is less than 0.05, there is significant statistical difference
    if p_val < 0.05:
        print(f'We reject null hypothesis for {first} and {second} due to significant statistical differences shown by a p-value of {p_val!s}.')
    else:
        print(f'We fail to reject null hypothesis for {first} and {second} due to lacking significant statistical differences shown by a p-value of {p_val!s}.')

# result is as expected
compareList = ['Run 1', 'Run 16']
ANOVA_oneway_func(all_data, compareList)

In [None]:
# confirms that humidity has a lesser impact than distance
compareList = ['Run 1', 'Run 11']
ANOVA_oneway_func(all_data, compareList)

In [None]:
# recognizes that the two are part of the same set
compareList = ['Run 1', 'Run 2']
ANOVA_oneway_func(all_data, compareList)

In [None]:
# result is as expected
compareList = ['Run 1', 'Run 6']
ANOVA_oneway_func(all_data, compareList)

In [None]:
def ANOVA_oneway_func_multi_runs(data, compare_List):
    '''Prints the outcome of a one-way ANOVA between two groups of runs.

    data         -- dataframe with one column per run
    compare_List -- two lists of column names; the readings of all runs
                    in a list are pooled into one sample

    NaN readings are left out. The null hypothesis is rejected for a
    p-value below 0.05. Nothing is returned.'''
    # to not mix in nans
    group_a, group_b = (
        [value for run in runs for value in data[run] if pd.notna(value)]
        for runs in (compare_List[0], compare_List[1])
    )

    f_val, p_val = stats.f_oneway(group_a, group_b)

    # if the p value is less than 0.05, there is significant statistical difference
    if p_val < 0.05:
        print(f'We reject null hypothesis for the two groups of data due to significant statistical differences shown by a p-value of {p_val!s}.')
    else:
        print(f'We fail to reject null hypothesis for the two groups of data due to lacking significant statistical differences shown by a p-value of {p_val!s}.')

# result is as expected
compareList = [['Run 1', 'Run 2', 'Run 3', 'Run 4', 'Run 5'], ['Run 16', 'Run 17', 'Run 18', 'Run 19', 'Run 20']]
ANOVA_oneway_func_multi_runs(all_data, compareList)

In [None]:
compareList = [['Run 1', 'Run 2', 'Run 3', 'Run 4', 'Run 5'], ['Run 11', 'Run 12', 'Run 13', 'Run 14', 'Run 15']]
ANOVA_oneway_func_multi_runs(all_data, compareList)

In [None]:
# Compares the pooled runs 1-5 against the pooled runs 6-10.
# NOTE(review): ' Run 8' carries a leading space, unlike the other names -
# confirm that it matches the column header of the spreadsheet.
compareList = [['Run 1', 'Run 2', 'Run 3', 'Run 4', 'Run 5'], ['Run 6', 'Run 7', ' Run 8', 'Run 9', 'Run 10']]
ANOVA_oneway_func_multi_runs(all_data, compareList)

### Two-Way ANOVA code

In [None]:
def ANOVA_twoway_func(data, group_Lists_a, group_Lists_b):
    '''Returns the p-values of a two-way ANOVA on run columns.

    data          -- dataframe with one column of readings per run
    group_Lists_a -- one list of column names per level of factor A
    group_Lists_b -- one list of column names per level of factor B

    Every column is expected to appear in exactly one level of each
    factor. NaN readings are ignored. The interaction sum of squares is
    obtained by subtraction, which is exact only for a balanced design
    (the same number of readings in every A x B cell).

    Prints the grand mean and the total sum of squares, then returns
    (p_a, p_b, p_axb). Raises ValueError when a factor level or an
    A x B cell has no readings.'''

    def pooled(columns):
        # all non-NaN readings of the given runs as one flat list
        return [value for run in columns for value in data[run] if pd.notna(value)]

    def mean(values):
        return sum(values) / len(values)

    def squared_dev(values, centre):
        # sum of squared deviations of the values from centre
        return sum((value - centre) ** 2 for value in values)

    levels_a = [pooled(columns) for columns in group_Lists_a]
    levels_b = [pooled(columns) for columns in group_Lists_b]
    # a cell is made of the runs shared by one level of A and one level of B
    cells = [pooled([run for run in columns_a if run in columns_b])
             for columns_a in group_Lists_a
             for columns_b in group_Lists_b]
    if any(len(group) == 0 for group in levels_a + levels_b + cells):
        raise ValueError('every factor level and every A x B cell needs at least one reading')

    # the levels of factor A together hold every reading exactly once
    observations = [value for level in levels_a for value in level]

    df_a = len(levels_a) - 1
    df_b = len(levels_b) - 1
    df_axb = df_a * df_b
    df_w = len(observations) - len(cells)

    mean_total = mean(observations)
    print(mean_total)
    ss_total = squared_dev(observations, mean_total)
    print(ss_total)

    # between-level sums of squares are weighted by the level size
    ss_a = sum(len(level) * (mean(level) - mean_total) ** 2 for level in levels_a)
    ss_b = sum(len(level) * (mean(level) - mean_total) ** 2 for level in levels_b)
    # within-cell (error) sum of squares
    ss_w = sum(squared_dev(cell, mean(cell)) for cell in cells)
    ss_axb = ss_total - ss_a - ss_b - ss_w

    MS_a = ss_a/df_a
    MS_b = ss_b/df_b
    MS_axb = ss_axb/df_axb
    MS_w = ss_w/df_w

    f_val_a = MS_a/MS_w
    f_val_b = MS_b/MS_w
    f_val_axb = MS_axb/MS_w

    p_a = stats.f.sf(f_val_a, df_a, df_w)
    p_b = stats.f.sf(f_val_b, df_b, df_w)
    p_axb = stats.f.sf(f_val_axb, df_axb, df_w)

    return p_a, p_b, p_axb
# groupLists1 splits the runs into 1-10 versus 11-20, groupLists2 into
# 1-5 and 11-15 versus 6-10 and 16-20.
# NOTE(review): going by the conditions listed for the per-group histograms
# this would make factor A the humidity and factor B the distance - confirm.
# NOTE(review): ' Run 8' carries a leading space, unlike the other names -
# confirm that it matches the column header of the spreadsheet.
groupLists1 = [['Run 1', 'Run 2', 'Run 3', 'Run 4', 'Run 5', 'Run 6', 'Run 7', ' Run 8', 'Run 9', 'Run 10'], ['Run 11', 'Run 12', 'Run 13', 'Run 14', 'Run 15', 'Run 16', 'Run 17', 'Run 18', 'Run 19', 'Run 20']]
groupLists2 = [['Run 1', 'Run 2', 'Run 3', 'Run 4', 'Run 5', 'Run 11', 'Run 12', 'Run 13', 'Run 14', 'Run 15'], ['Run 6', 'Run 7', ' Run 8', 'Run 9', 'Run 10', 'Run 16', 'Run 17', 'Run 18', 'Run 19', 'Run 20']]

# p-values for factor A, factor B and their interaction, in that order
p_a_val, p_b_val, p_axb_val = ANOVA_twoway_func(all_data, groupLists1, groupLists2)
print(p_a_val)
print(p_b_val)
print(p_axb_val)

In [None]:
all_data.sum(axis=0,skipna=True)

In [None]:
run_sum = all_data.sum(axis=0,skipna=True)
print(sum(run_sum))

In [None]:
def ANOVA_twoway_func_Line(data):
    '''Returns the p-values of a two-way ANOVA on long-format data.

    data -- dataframe with one reading per row in the columns Power,
            Distance and Humidity

    Rows with a missing value in any of the three columns are left out.
    Distance and Humidity may have any number of levels with any values;
    every Distance x Humidity combination is expected to be present. The
    interaction sum of squares is obtained by subtraction, which is
    exact only for a balanced design.

    Returns (p_a, p_b, p_axb) for Distance, Humidity and their
    interaction.'''
    complete = data.dropna(subset=['Power', 'Distance', 'Humidity'])
    power = complete['Power']

    N = len(power)
    levels_a = complete['Distance'].nunique()
    levels_b = complete['Humidity'].nunique()
    df_a = levels_a - 1
    df_b = levels_b - 1
    df_axb = df_a*df_b
    df_w = N - (levels_a*levels_b)

    grand_mean = power.mean()

    # transform('mean') repeats the group mean on every row of the group,
    # so summing over the rows weights each level by its number of readings
    mean_by_distance = complete.groupby('Distance')['Power'].transform('mean')
    mean_by_humidity = complete.groupby('Humidity')['Power'].transform('mean')
    mean_by_cell = complete.groupby(['Distance', 'Humidity'])['Power'].transform('mean')

    ssq_a = ((mean_by_distance - grand_mean)**2).sum()
    ssq_b = ((mean_by_humidity - grand_mean)**2).sum()
    ssq_t = ((power - grand_mean)**2).sum()
    # within-cell (error) sum of squares
    ssq_w = ((power - mean_by_cell)**2).sum()

    ssq_axb = ssq_t - ssq_a - ssq_b - ssq_w

    ms_a = ssq_a/df_a
    ms_b = ssq_b/df_b
    ms_axb = ssq_axb/df_axb
    ms_w = ssq_w/df_w

    f_a = ms_a/ms_w
    f_b = ms_b/ms_w
    f_axb = ms_axb/ms_w

    p_a = stats.f.sf(f_a, df_a, df_w)
    p_b = stats.f.sf(f_b, df_b, df_w)
    p_axb = stats.f.sf(f_axb, df_axb, df_w)

    return p_a, p_b, p_axb
    
# Two-way ANOVA on the sheet that is read in below; the function uses its
# Power, Distance and Humidity columns.
line_data = pd.read_excel("FM Radio Signal Strength dra.xlsx")
# p-values for distance, humidity and their interaction, in that order
p_a_val, p_b_val, p_axb_val = ANOVA_twoway_func_Line(line_data)
print(p_a_val)
print(p_b_val)
print(p_axb_val)

In [None]:
def json_input():
    '''Builds a dictionary from typed answers and writes it to a json file.

    Asks for the file name and the number of key/value pairs. Every key
    is an int or a str. A value is either a single item (an int, a str or
    a list of ints/strs that is entered element by element) or a list of
    dictionaries whose values are an int, a str or a comma-separated list
    of ints.

    A question with a fixed set of answers is repeated until one of the
    accepted answers is typed. Nothing is returned.'''
    import json

    def choose(prompt, options):
        # repeat the question until one of the accepted answers is typed
        while True:
            answer = input(prompt)
            if answer in options:
                return answer
            print('Please answer with one of: ' + '/'.join(options))

    def ask_key():
        if choose('Is the key a integer or string? (int/str) ', ('int', 'str')) == 'int':
            return int(input('Input key: '))
        return str(input('Input key: '))

    value_kind_prompt = 'Is the value an integer, list, or string? (int/lst/str) '
    value_kinds = ('int', 'lst', 'str')

    data = {}
    fileName = input('What is the name of the json file? ')
    section_num = int(input('How many key/value pairs are there? '))

    for _ in range(section_num):
        key_name = ask_key()

        value_type = choose('Is the value a single item or a list? (Single/List) ', ('Single', 'List'))
        if value_type == 'Single':
            type_value = choose(value_kind_prompt, value_kinds)
            if type_value == 'int':
                value_item = int(input('What is the value for the key? '))
            elif type_value == 'lst':
                elements_num = int(input('How long is this list? '))
                value_item = []
                for _ in range(elements_num):
                    if choose('Element type? (int/str) ', ('int', 'str')) == 'int':
                        value_item.append(int(input('Element in value list? ')))
                    else:
                        value_item.append(str(input('Element in value list? ')))
            else:
                value_item = str(input('What is the value for this key? '))
            data[key_name] = value_item
        else:
            # a list of dictionaries, each entered pair by pair
            value_List = []
            list_len = int(input('How many dictionaries are in the list? '))
            for _ in range(list_len):
                list_dict = {}
                dict_len = int(input('How many key/value pairs are there in this dictionary? '))
                for _ in range(dict_len):
                    key_name_list = ask_key()
                    type_value = choose(value_kind_prompt, value_kinds)
                    if type_value == 'int':
                        key_value = int(input('What is the value for this key? '))
                    elif type_value == 'lst':
                        key_value = [int(x) for x in input('What are the values for the key? (Values should be separated by commas)').split(",")]
                    else:
                        key_value = str(input('What is the value for this key? '))
                    list_dict[key_name_list] = key_value
                value_List.append(list_dict)
            data[key_name] = value_List

    # Serialise before the file is opened so that a failure cannot leave a
    # truncated file behind. int and str keys cannot be ordered against each
    # other, so a dictionary that mixes them is written in entry order.
    try:
        text = json.dumps(data, indent=4, sort_keys=True, separators=(', ',':'))
    except TypeError:
        text = json.dumps(data, indent=4, separators=(', ',':'))
    with open(fileName, 'w') as outfile:
        outfile.write(text)

        
json_input()

In [None]:
#python input forms
# Reads one line, splits it at the commas and converts every piece to int;
# a piece that is not an integer makes int() raise ValueError.
x = [int(x) for x in input("Enter multiple value: ").split(",")] 

In [None]:
# function for choosing data type

def input_type(item):
    '''Returns item as an int when int() accepts it,
    otherwise returns item unchanged'''
    try:
        converted = int(item)
    except ValueError:
        converted = item
    return converted

print(type(input_type('three')))
print(type(input_type('3')))

In [None]:
def json_input_ver2():
    '''Builds a dictionary from typed answers and writes it to a json file.

    Asks for the file name and the number of key/value pairs. Keys and
    values are passed through input_type, so anything int() accepts is
    stored as an int and everything else as typed. A value is either a
    single item (one value or a comma-separated list of values) or a
    list of dictionaries that are entered pair by pair.

    A question with a fixed set of answers is repeated until one of the
    accepted answers is typed. Nothing is returned.'''
    import json

    def choose(prompt, options):
        # repeat the question until one of the accepted answers is typed
        while True:
            answer = input(prompt)
            if answer in options:
                return answer
            print('Please answer with one of: ' + '/'.join(options))

    def ask_value(single_prompt):
        # one value, or one line of comma-separated values (asked only once)
        kind = choose('Is the item a single value or a list of values? (Single/List) ', ('Single', 'List'))
        if kind == 'Single':
            return input_type(input(single_prompt))
        typed = input('What are the values for the key? (Values should be separated by commas) ')
        # blanks around a piece are dropped so that "a, b" gives 'a' and 'b'
        return [input_type(piece.strip()) for piece in typed.split(",")]

    data = {}
    fileName = input('What is the name of the json file? ')
    section_num = int(input('How many key/value pairs are there? '))

    for _ in range(section_num):
        key_final = input_type(input('What is the key?'))
        value_type = choose('Is the value a single item or a list of dictionaries? (Single/List) ', ('Single', 'List'))
        if value_type == 'Single':
            data[key_final] = ask_value('What is the value? ')
        else:
            value_List = []
            list_len = int(input('How many dictionaries are in the list? '))
            for _ in range(list_len):
                list_dict = {}
                dict_len = int(input('How many key/value pairs are there in this dictionary? '))
                for _ in range(dict_len):
                    list_key_final = input_type(input('What is the key?'))
                    list_dict[list_key_final] = ask_value('What is the value for this key? ')
                value_List.append(list_dict)
            data[key_final] = value_List

    # Serialise before the file is opened so that a failure cannot leave a
    # truncated file behind. int and str keys cannot be ordered against each
    # other, so a dictionary that mixes them is written in entry order.
    try:
        text = json.dumps(data, indent=4, sort_keys=True, separators=(', ',':'))
    except TypeError:
        text = json.dumps(data, indent=4, separators=(', ',':'))
    with open(fileName, 'w') as outfile:
        outfile.write(text)

        
json_input_ver2()

In [36]:
#k-nearest; origin: https://www.dataquest.io/blog/k-nearest-neighbors-in-python/
# Walk-through on nba_2013.csv: find the player closest to LeBron James,
# then predict 'pts' from the other statistics with a 5-nearest-neighbour
# regressor and print the predictions, the actual values and the error.
import math
import pandas
with open("nba_2013.csv", 'r') as csvfile:
    nba_df = pandas.read_csv(csvfile)
print(nba_df.columns.values) # The names of all the columns in the data.

# Select Lebron James from our dataset
selected_player = nba_df[nba_df["player"] == "LeBron James"].iloc[0]

# Choose only the numeric columns (we'll use these to compute euclidean distance)
distance_columns = ['age', 'g', 'gs', 'mp', 'fg', 'fga', 'fg.', 'x3p', 'x3pa', 'x3p.', 'x2p', 'x2pa', 'x2p.', 'efg.', 'ft', 'fta', 'ft.', 'orb', 'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf', 'pts']

#clearing NA values
nba = nba_df.dropna()

def euclidean_distance(row):
    """
    A simple euclidean distance function
    """
    inner_value = 0
    for k in distance_columns:
        inner_value += (row[k] - selected_player[k]) ** 2
    return math.sqrt(inner_value)

# Find the distance from each player in the dataset to lebron.
lebron_distance = nba.apply(euclidean_distance, axis=1)

# Select only the numeric columns from the NBA dataset
nba_numeric = nba[distance_columns]
# Normalize all of the numeric columns
nba_normalized = (nba_numeric - nba_numeric.mean()) / nba_numeric.std()

from scipy.spatial import distance

# Fill in NA values in nba_normalized
nba_normalized.fillna(0, inplace=True)

# Find the normalized vector for lebron james.
# .iloc[0] turns the one-row frame into a 1-D vector, which is the input
# that distance.euclidean is documented to take.
lebron_normalized = nba_normalized[nba["player"] == "LeBron James"].iloc[0]

# Find the distance between lebron james and everyone else.
euclidean_distances = nba_normalized.apply(lambda row: distance.euclidean(row, lebron_normalized), axis=1)

# Create a new dataframe with distances.
distance_frame = pandas.DataFrame(data={"dist": euclidean_distances, "idx": euclidean_distances.index})
distance_frame.sort_values(by=["dist"], inplace=True)
# Find the most similar player to lebron (the lowest distance to lebron is lebron, the second smallest is the most similar non-lebron player)
second_smallest = distance_frame.iloc[1]["idx"]
most_similar_to_lebron = nba.loc[int(second_smallest)]["player"]

import random
from numpy.random import permutation

# Randomly shuffle the index of nba.
random_indices = permutation(nba.index)

# Set a cutoff for how many items we want in the test set (in this case 1/3 of the items)
test_cutoff = math.floor(len(nba)/3)

# Generate the test set by taking the first 1/3 of the randomly shuffled indices.
# The slice starts at 0 so that no row is left out of both sets.
test = nba.loc[random_indices[:test_cutoff]]

# Generate the train set with the rest of the data.
train = nba.loc[random_indices[test_cutoff:]]

# The columns that we will be making predictions with.
x_columns = ['age', 'g', 'gs', 'mp', 'fg', 'fga', 'fg.', 'x3p', 'x3pa', 'x3p.', 'x2p', 'x2pa', 'x2p.', 'efg.', 'ft', 'fta', 'ft.', 'orb', 'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf']
# The column that we want to predict.
y_column = ["pts"]

from sklearn.neighbors import KNeighborsRegressor
# Create the knn model.
# Look at the five closest neighbors.
knn = KNeighborsRegressor(n_neighbors=5)
# Fit the model on the training data.
knn.fit(train[x_columns], train[y_column])
# Make point predictions on the test set using the fit model.
predictions = knn.predict(test[x_columns])

# Get the actual values for the test set.
actual = test[y_column]

# Compute the mean squared error of our predictions.
# (one entry per predicted column, so a single entry here)
mse = (((predictions - actual) ** 2).sum()) / len(predictions)

#print predicted values
predictions_List = [item for sublist in predictions for item in sublist]
print('Predicted values (pts): ')
print(*predictions_List, sep=", ")

#print actual values
# taken straight from the frame instead of splitting its text rendering,
# which depends on how pandas pads the printed column
actual_List = actual.to_numpy().ravel()

print('Actual values (pts): ')
print(*actual_List, sep=", ")

#print the mean squared error
print('Mean squared error: ', float(mse.iloc[0]))

['player' 'pos' 'age' 'bref_team_id' 'g' 'gs' 'mp' 'fg' 'fga' 'fg.' 'x3p'
 'x3pa' 'x3p.' 'x2p' 'x2pa' 'x2p.' 'efg.' 'ft' 'fta' 'ft.' 'orb' 'drb'
 'trb' 'ast' 'stl' 'blk' 'tov' 'pf' 'pts' 'season' 'season_end']
Predicted values (pts): 
994.8, 1920.6, 768.0, 84.8, 1294.8, 589.4, 1140.8, 87.0, 639.4, 297.0, 694.6, 514.6, 1028.4, 900.8, 807.6, 252.6, 179.2, 1043.2, 31.6, 355.0, 406.6, 893.0, 79.6, 595.0, 186.6, 279.8, 11.6, 169.0, 420.0, 784.4, 704.2, 56.4, 555.6, 407.2, 313.0, 576.2, 294.0, 1609.0, 502.4, 1369.8, 817.0, 994.0, 1096.6, 156.8, 439.2, 814.6, 12.6, 1274.0, 532.2, 786.0, 551.4, 275.4, 635.8, 1270.8, 869.0, 172.0, 1042.0, 551.6, 1513.4, 76.6, 368.2, 760.6, 84.0, 334.4, 395.0, 1070.4, 18.2, 474.6, 818.2, 33.6, 203.8, 696.4, 438.6, 571.4, 319.8, 826.2, 235.6, 1048.2, 116.0, 1403.4, 367.8, 195.8, 914.0, 59.0, 619.2, 619.8, 68.0, 407.0, 269.4, 15.2, 420.0, 966.0, 802.2, 766.4, 598.6, 27.4, 457.0, 129.0, 1046.6, 251.0, 784.2, 99.4, 849.4, 356.2, 548.6, 1146.0, 130.8, 446.4, 571.0, 1

In [37]:
def knearest_analysis(data, x_columns, y_column):
    '''Fits a 5-nearest-neighbour regressor and prints how well it predicts.

    data      -- dataframe without missing values in the columns used
    x_columns -- list of the column names to predict from
    y_column  -- list holding the name of the column to predict

    A random third of the rows is held back as test set and the model is
    fitted on the remaining rows. Prints the predicted values, the actual
    values and the mean squared error for the test set. Nothing is
    returned.'''
    import math
    from numpy.random import permutation
    from sklearn.neighbors import KNeighborsRegressor

    # Randomly shuffle the index of the data.
    random_indices = permutation(data.index)

    # Set a cutoff for how many items we want in the test set (in this case 1/3 of the items)
    test_cutoff = math.floor(len(data)/3)

    # Generate the test set by taking the first 1/3 of the randomly shuffled indices.
    # The slice starts at 0 so that no row is left out of both sets.
    test = data.loc[random_indices[:test_cutoff]]

    # Generate the train set with the rest of the data.
    train = data.loc[random_indices[test_cutoff:]]

    # Create the knn model.
    # Look at the five closest neighbors.
    knn = KNeighborsRegressor(n_neighbors=5)
    # Fit the model on the training data.
    knn.fit(train[x_columns], train[y_column])
    # Make point predictions on the test set using the fit model.
    predictions_List = knn.predict(test[x_columns]).ravel()
    # Get the actual values for the test set, as a flat array that lines up
    # with the predictions.
    actual_List = test[y_column].to_numpy().ravel()

    # Compute the mean squared error of our predictions.
    mse = ((predictions_List - actual_List) ** 2).sum() / len(test)

    #print predicted values
    print('Predicted values: ')
    print(*predictions_List, sep=", ")

    #print actual values
    print('Actual values: ')
    print(*actual_List, sep=", ")

    #print the mean squared error
    print('Mean squared error: ', float(mse))
    
x_columns_test = ['age', 'g', 'gs', 'mp', 'fg', 'fga', 'fg.', 'x3p', 'x3pa', 'x3p.', 'x2p', 'x2pa', 'x2p.', 'efg.', 'ft', 'fta', 'ft.', 'orb', 'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf']
# The column that we want to predict.
y_column_test = ["pts"]

knearest_analysis(nba, x_columns_test, y_column_test)

Predicted values: 
1815.8, 1048.6, 676.4, 1000.8, 157.4, 456.2, 725.2, 308.8, 571.2, 1064.6, 120.8, 430.8, 188.6, 965.2, 334.4, 394.8, 918.6, 872.8, 1053.8, 179.6, 1073.8, 1344.6, 11.2, 12.0, 1436.6, 224.6, 101.4, 1090.4, 774.6, 964.2, 756.6, 1263.6, 189.8, 75.6, 344.6, 321.0, 1095.2, 1098.4, 515.8, 835.4, 7.0, 101.4, 1726.8, 331.0, 1204.6, 1085.0, 464.2, 168.0, 6.0, 81.6, 1128.4, 841.4, 7.0, 676.6, 349.6, 289.6, 192.2, 396.4, 1419.4, 226.4, 65.0, 651.0, 302.6, 1323.8, 203.4, 239.8, 613.2, 370.4, 806.2, 559.4, 328.4, 101.6, 191.0, 1356.8, 364.8, 808.4, 654.4, 312.6, 912.8, 179.2, 825.0, 404.6, 7.0, 867.4, 144.2, 25.6, 964.0, 15.8, 560.4, 119.4, 1387.4, 27.0, 189.8, 1092.6, 81.6, 467.4, 433.6, 481.8, 88.2, 417.2, 89.8, 30.0, 1729.4, 92.2, 38.6, 545.4, 1208.6, 361.0, 576.4, 42.6, 582.8, 934.0, 666.6, 651.8, 101.4, 843.2, 651.0, 1101.0, 60.6, 808.4, 553.8, 421.2, 411.2, 1971.4, 997.2, 685.4, 921.0, 1049.8, 55.8, 344.6, 851.4, 34.4, 429.4
Actual values: 
1791, 1069, 703, 982, 132, 378, 821

In [None]:
# py qt; requires some hard coding

from PyQt5.QtWidgets import QApplication, QWidget, QInputDialog, QLineEdit

def pyqt_input_col_len_5(num_of_col):
    '''Returns a dataframe with num_of_col columns of five typed rows each.

    One dialog is shown per column, with a field for the column title and
    five fields for the rows. The text of the fields is read when the
    dialog is closed and is stored as strings. Entering the same title
    twice keeps only the column that was entered last.'''
    # imported here because the cell-level import only brings in part of
    # the names that are needed
    from PyQt5.QtCore import Qt
    from PyQt5.QtGui import QFont
    from PyQt5.QtWidgets import (QApplication, QDialog, QDialogButtonBox,
                                 QFormLayout, QLineEdit)

    # widgets can only be created while a QApplication exists; the
    # reference is kept so that it stays alive for the whole function
    app = QApplication.instance() or QApplication([])

    row_names = ("First Row", "Second Row", "Third Row", "Fourth Row", "Fifth Row")
    columns = {}
    for _ in range(num_of_col):
        dialog = QDialog()
        flo = QFormLayout(dialog)

        col_title = QLineEdit()
        col_title.setAlignment(Qt.AlignRight)
        col_title.setFont(QFont("Arial",20))
        flo.addRow("Column Title", col_title)

        row_fields = []
        for row_name in row_names:
            field = QLineEdit()
            flo.addRow(row_name, field)
            row_fields.append(field)

        buttons = QDialogButtonBox(QDialogButtonBox.Ok)
        buttons.accepted.connect(dialog.accept)
        flo.addRow(buttons)

        # blocks until the dialog is closed
        dialog.exec_()

        columns[col_title.text()] = [field.text() for field in row_fields]

    return pd.DataFrame(columns)

df = pyqt_input_col_len_5(3)
print(df)