In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Load every CSV in the datasets folder into `datasets`, a dict of DataFrames
# keyed by a camelCase name derived from the file name.
#
# The working directory is never changed, so a failure part-way through
# cannot leave the kernel stranded inside the datasets folder.

# absolute path of the datasets folder, kept so later code can locate the csv files
root = os.path.abspath("datasets")

# Only *.csv entries are datasets; this skips hidden/system files such as
# .DS_Store without requiring them to exist. Sorting makes the load order
# (and therefore which file wins when two files map to the same key)
# deterministic across machines.
files = sorted(name for name in os.listdir(root) if name.lower().endswith(".csv"))

# dictionary referenced later to look up DataFrames by camelCase name
datasets = dict()

for item in files:

    # converts the csv file to a DataFrame
    path = os.path.join(root, item)
    df = pd.read_csv(path)

    # drops the file extension, then the leading segment before the first "_"
    stem, _extension = os.path.splitext(item)
    targets = stem.split("_")[1:]

    # converts the remaining snake_case words to camelCase:
    # first word untouched, every following word capitalized
    target = "".join(targets[:1] + [word.capitalize() for word in targets[1:]])

    # stores the DataFrame with the camelCase name as key
    datasets[target] = df

# ! there will be fewer datasets than files in the datasets folder, since
# ! several files can map to the same key (a dataset spanning multiple questions)
# * { files: 30 , datasets: 25}


In [3]:
# Loads 'data_.csv' (path relative to the current working directory) into
# logic_df. Later cells read these columns from it: company_name, question_id,
# title, prompt, question_type, python_difficulty, mySQL_difficulty and file.
logic_df = pd.read_csv('data_.csv')

In [4]:
from Questions import  Question

def _first_value(frame, column):
    """Return the first distinct value found in `column` of `frame`."""
    return frame[column].unique()[0]


# rows of logic_df grouped by company name
logic_by_company = logic_df.groupby("company_name")

# unique company names and unique question IDs, in order of first appearance
companies = list(logic_df["company_name"].unique())
question_IDs = list(logic_df["question_id"].unique())

# nested dictionary: company name -> {question key -> Question};
# every company starts with an empty inner dictionary
datasets_by_company = {company: dict() for company in companies}

# builds one Question per question ID and files it under its company
for question_id in question_IDs:
    rows = logic_df[logic_df['question_id'] == question_id]

    title = _first_value(rows, 'title')
    company_name = _first_value(rows, 'company_name')

    # key is the title with each word capitalized and the whitespace removed,
    # e.g. "Liked Posts" -> "LikedPosts"
    question_key = "".join(word.capitalize() for word in title.split())

    question = Question(
        question_id,
        title,
        _first_value(rows, 'prompt'),
        _first_value(rows, 'question_type'),
        _first_value(rows, 'python_difficulty'),
        _first_value(rows, 'mySQL_difficulty'),
        list(rows['file'].unique()),
        company_name,
    )

    datasets_by_company[company_name][question_key] = question

In [5]:
# Flattens the nested company -> question dictionary into a single mapping of
# question key -> attribute dict (via vars) of the Question object.
# A key that appears under more than one company keeps only the last one seen.
test = {
    question_key: vars(question_obj)
    for questions_by_key in datasets_by_company.values()
    for question_key, question_obj in questions_by_key.items()
}

# one row per question key, one column per Question attribute
data = pd.DataFrame(test).T

# data

In [6]:
# sorted in place so the prompt lists the companies alphabetically
companies.sort()

# filter down by company; surrounding whitespace is stripped so a stray
# space or newline in the answer does not cause a failed lookup
company_answer = input(f"Enter company name from following \n {companies}").strip()
# company_answer = "Meta/Facebook"

# fail with an explanatory message instead of a bare KeyError on a typo
if company_answer not in datasets_by_company:
    raise KeyError(
        f"Unknown company {company_answer!r}; expected one of {companies}"
    )

# question key -> Question mapping for the chosen company
question_from_company = datasets_by_company[company_answer]

In [7]:
# Question keys available for the chosen company. Read straight from
# datasets_by_company rather than from question_from_company itself, so this
# cell can be re-run: the name is rebound from a dict to a list here, and
# calling .keys() on the list during a second run would raise AttributeError.
question_from_company = list(datasets_by_company[company_answer].keys())

#filter down by question; surrounding whitespace is stripped before the lookup
question_title = input(f"Enter question title from follow \n {question_from_company}").strip()
# question_title = "LikedPosts"

# fail with an explanatory message instead of a bare KeyError on a typo
if question_title not in datasets_by_company[company_answer]:
    raise KeyError(
        f"Unknown question {question_title!r} for {company_answer!r}; "
        f"expected one of {question_from_company}"
    )

# the selected Question object
question = datasets_by_company[company_answer][question_title]


In [8]:

# shows the selected question's title, prompt and both difficulty ratings,
# one per line
question_summary = "\n".join([
    f"Title: {question.title}",
    f"Prompt: {question.prompt}",
    f"Python Difficulty: {question.python_difficulty}",
    f"SQL Difficulty: {question.sql_difficulty}",
])
print(question_summary)

Title: Liked Posts
Prompt: Find the number of posts which were reacted to with a like.
Python Difficulty: medium
SQL Difficulty: medium


In [9]:
from Functions import getDatasetAsDataFrame

#number of rows shown
r = 5

# Loads every dataset listed on the question, in order.
# NOTE(review): relies on Question exposing the file list as `fileNames`;
# the class is defined in Questions.py and not visible here - confirm.
frames = [getDatasetAsDataFrame(name) for name in question.fileNames]

# The first and second datasets are assigned by position. Emptiness is NOT
# used as a "slot not filled yet" marker, so a dataset that legitimately has
# no rows is neither overwritten by the next one nor treated as missing.
# An empty placeholder frame stands in for a dataset that is not present.
has_second = len(frames) > 1
df1 = frames[0] if frames else pd.DataFrame({'A': []})
df2 = frames[1] if has_second else pd.DataFrame({'A': []})

# preview rows and column dtypes of the needed df(s); the second pair is an
# empty string when the question only uses one dataset
data1 = df1.head(r)
column_data1 = df1.dtypes
if has_second:
    data2 = df2.head(r)
    column_data2 = df2.dtypes
else:
    data2 = ''
    column_data2 = ''

print(f"Starting DataFrame(s):\n{data1}\n\n{data2}")


Starting DataFrame(s):
   poster  friend reaction  date_day  post_id
0       2       1     like         1        0
1       2       6     like         1        0
2       1       2     like         1        1
3       1       3    heart         1        1
4       1       4     like         1        1

   post_id  poster                                   post_text  \
0        0       2  The Lakers game from last night was great.   
1        1       1                  Lebron James is top class.   
2        2       2                        Asparagus tastes OK.   
3        3       1               Spaghetti is an Italian food.   
4        4       3             User 3 is not sharing interests   

                   post_keywords           post_date  
0        [basketball,lakers,nba]  2019-01-01 0:00:00  
1  [basketball,lebron_james,nba]  2019-01-02 0:00:00  
2               [asparagus,food]  2019-01-01 0:00:00  
3               [spaghetti,food]  2019-01-02 0:00:00  
4                       [#sp

In [10]:
# column dtypes of the first dataset, a blank line, then those of the second
# (an empty string when there is no second dataset)
dataframe_info = f"DataFrame Info:\n{column_data1}\n\n{column_data2}"
print(dataframe_info)


DataFrame Info:
poster       int64
friend       int64
reaction    object
date_day     int64
post_id      int64
dtype: object

post_id           int64
poster            int64
post_text        object
post_keywords    object
post_date        object
dtype: object


In [11]:
from Functions import getFinalResult

# fetches the expected final result for the selected question up front
finalResult = getFinalResult(question_title)

# the answer is capitalized (first character upper-cased, rest lower-cased),
# so both "y" and "Y" are accepted as yes
q1 = input("Would you like to see final result? (Y/N)").capitalize()
if q1 == "Y":
    print(f"Final Result:\n\n{finalResult}")


In [12]:
from Functions import getPythonSolution, getSQLSolution, getSolutions

#get solution(s) from directory for question
pythonSolution = getPythonSolution(question_title)
sqlSolution = getSQLSolution(question_title)
solutions = getSolutions(question_title)


# Capitalizes THIS prompt's answer. Previously the answer from the
# "final result" prompt (q1) was capitalized instead, so the reply typed
# here was ignored.
q2a = input("Would you like to see solution? (Y/N)")
q2a = str.capitalize(q2a)
if q2a == "Y":
    q2b = input("Select the solution(s) you will like to see (Python, SQL, Both)")
    # compared case-insensitively and without surrounding whitespace, so
    # "python", "sql " or "BOTH" are accepted as well as the exact spellings
    choice = q2b.strip().lower()
    if choice == "python":
        print(pythonSolution)
    elif choice == "sql":
        print(sqlSolution)
    elif choice == "both":
        print(solutions)
    else:
        # tell the user why nothing was shown instead of ending silently
        print(f"Unrecognized choice {q2b!r}; expected Python, SQL or Both")
