In [1]:
import numpy as np
from datascience import *

In [2]:
# Read the raw researcher records from the CSV file into a Table. Every later
# cleaning step starts from this `researchers` table.
file_name = "research_data.csv"
researchers = Table.read_table(filepath_or_buffer=file_name)
researchers

Name,Interests,Department
Sangwei Lu,pathogenesis~Salmonella~foodborne diseases,School of Public Health
David A. Aaker,business~marketing~branding,Haas School of Business
Pieter Abbeel,robotics~machine learning,Division of Computer Science/EECS
Elizabeth Abel,feminist theory~psychoanalysis~Virginia Woolf~race and g ...,Department of English
Norman Abrahamson,civil and environmental engineering~earthquake ground mo ...,Department of Civil and Environmental Engineering
Dor Abrahamson,mathematical cognition~design-based research~mixed-media ...,Graduate School of Education
Barbara Abrams,obesity~maternal and child health~epidemiology~nutrition ...,School of Public Health
Kathryn Abrams,law~feminist jurisprudence~voting rights~constitutional law,Boalt Hall School of Law
Richard Abrams,politics~recent U.S. history: business foreign relations ...,Department of History
Charisma Acey,water~sanitation~basic services delivery~poverty allevia ...,Department of City & Regional Planning


In [10]:
# Count how many rows each researcher name occupies in the raw data, listed
# alphabetically by name (ascending is the default sort order).
grouped_researchers = (
    researchers
    .group("Name")
    .sort("Name")
)
grouped_researchers

Name,count
A. James Gregor,8
A. Paul Alivisatos,9
AMANDA TYLER,9
Aaron Fisher,9
Aaron S. Edlin,9
Abby Dernburg,9
Abdul JanMohamed,11
Abena Osseo-Asare,7
Abhishek Kaicker,9
Abigail De Kosnik,11


In [3]:
# Collapse repeated researchers into a single row per name, ordered
# alphabetically by name. Grouping with `min` produces columns labelled
# "Interests min" and "Department min", so both are renamed back to their
# original labels.
# NOTE(review): `min` appears to be applied to each column separately, so if a
# name's repeated rows are not identical the kept Interests and Department
# values could come from different rows -- confirm the duplicates are exact.
cleaned_researchers = (
    researchers
    .sort("Name")
    .group("Name", min)
    .relabeled("Interests min", "Interests")
    .relabeled("Department min", "Department")
)
cleaned_researchers

Name,Interests,Department
A. James Gregor,political science~methodology~political theory~comparati ...,Department of Political Science
A. Paul Alivisatos,physical chemistry~semiconductor nanocrystals~nanoscienc ...,Department of Chemistry
AMANDA TYLER,federal courts~separation of powers~habeas corpus~statut ...,Boalt Hall School of Law
Aaron Fisher,Anxiety~depression~personalized medicine~psychotherapy~p ...,Department of Psychology
Aaron S. Edlin,economics~industrial organization~regulation~antitrust,Department of Economics
Abby Dernburg,genomics~chromosome remodeling and reorganization during ...,Department of Molecular & Cell Biology
Abdul JanMohamed,Critical theory; theory of subjection; postcolonial lite ...,Department of English
Abena Osseo-Asare,African history~legal history~medical history~science hi ...,Department of History
Abhishek Kaicker,South Asia~Mughal~early modern~cities~history~Persian,Department of History
Abigail De Kosnik,new media~performance studies~performance theory~perform ...,"Theatre, Dance, and Performance Studies"


In [12]:
# Sanity check: grouped_researchers has one row per distinct name, so our
# de-duplicated cleaned_researchers table should have exactly as many rows.
cleaned_researchers.num_rows == grouped_researchers.num_rows

True

In [4]:
# Save the de-duplicated table as a CSV file, leaving out the pandas row index.
cleaned_researchers.to_df().to_csv(
    path_or_buf='cleaned_researchers.csv',
    index=False,
)

In [6]:
# Flags which rows of a table hold a separator-joined (multi-valued) entry in
# a given column, e.g. a faculty member listed under several departments.
def tilde_finder(table, column_name, separator="~", label="Duplicate?"):
    """Return a copy of ``table`` with a boolean column marking multi-valued rows.

    A row is flagged ``True`` when the string form of its ``column_name`` entry
    contains ``separator``. The check is a plain substring test; it does not
    compare rows with one another.

    Parameters
    ----------
    table
        Object exposing ``column(name)`` (iterable of entries) and
        ``with_column(label, values)``, such as a ``datascience`` Table.
    column_name
        Label of the column whose entries are inspected.
    separator : str, optional
        Substring that marks an entry as holding several values. Defaults to
        ``"~"``, the delimiter used in this data set.
    label : str, optional
        Label of the boolean column that is added. Defaults to
        ``"Duplicate?"``, which later cells filter on and drop.

    Returns
    -------
    Whatever ``table.with_column`` returns: the table plus the flag column.
    """
    # str() guards against non-string entries (e.g. numbers or missing values),
    # for which the `in` test would otherwise raise.
    flags = [separator in str(entry) for entry in table.column(column_name)]
    return table.with_column(label, flags)
# Flag each row of the cleaned table according to its Department entry.
if_dupe_table = tilde_finder(
    table=cleaned_researchers,
    column_name="Department",
)
if_dupe_table

Name,Interests,Department,Duplicate?
A. James Gregor,political science~methodology~political theory~comparati ...,Department of Political Science,False
A. Paul Alivisatos,physical chemistry~semiconductor nanocrystals~nanoscienc ...,Department of Chemistry,False
AMANDA TYLER,federal courts~separation of powers~habeas corpus~statut ...,Boalt Hall School of Law,False
Aaron Fisher,Anxiety~depression~personalized medicine~psychotherapy~p ...,Department of Psychology,False
Aaron S. Edlin,economics~industrial organization~regulation~antitrust,Department of Economics,False
Abby Dernburg,genomics~chromosome remodeling and reorganization during ...,Department of Molecular & Cell Biology,False
Abdul JanMohamed,Critical theory; theory of subjection; postcolonial lite ...,Department of English,False
Abena Osseo-Asare,African history~legal history~medical history~science hi ...,Department of History,False
Abhishek Kaicker,South Asia~Mughal~early modern~cities~history~Persian,Department of History,False
Abigail De Kosnik,new media~performance studies~performance theory~perform ...,"Theatre, Dance, and Performance Studies",False


In [7]:
# Keep only the rows whose "Duplicate?" flag is True, i.e. the faculty members
# whose Department entry was flagged as listing more than one department.
multiple_dept_faculty = if_dupe_table.where("Duplicate?", are.equal_to(True))
multiple_dept_faculty

Name,Interests,Department,Duplicate?
Alice M. Agogino,New product development~computer-aided design & data ...,Department of Mechanical Engineering~Haas School of Business,True
Alistair Sinclair,algorithms~applied probability~statistics~random walks~M ...,Department of Statistics~Division of Electrical Engineer ...,True
Andrew F. Stewart,archaeology~classics~Greek sculpture~ancient art and arc ...,Department of Classics~Department of History of Art,True
Anne Nesbet,culture~film studies~Slavic languages~early Soviet cultu ...,Department of Film and Media~Department of Italian Studies,True
Anne-lise Francois,popular culture~English~comparative literature~the moder ...,Department of English~Department of French,True
Anthony J. Cascardi,English~comparative literature~literature~Spanish~Portug ...,Department of Comparative Literature~Department of Spani ...,True
Anton Kaes,film studies~modern literature~literary and cultural the ...,Department of Film and Media~Department of German,True
Aram Thomasian,statistics~electrical engineering~computer science rando ...,Department of Statistics~Division of Electrical Engineer ...,True
Barbara Spackman,feminist theory~psychoanalysis~culture~fascism~gender st ...,Department of Comparative Literature~Department of Itali ...,True
Barrie Thorne,feminist theory~gender theory~ethnography~qualitative me ...,Department of Gender and Women's Studies~Department ...,True


In [8]:
# Remove the helper "Duplicate?" column so the table has its original columns
# again. What remains is the set of faculty members flagged as being involved
# in multiple departments.
cleaned_multiple_dept_faculty = multiple_dept_faculty.drop(["Duplicate?"])
cleaned_multiple_dept_faculty

Name,Interests,Department
Alice M. Agogino,New product development~computer-aided design & data ...,Department of Mechanical Engineering~Haas School of Business
Alistair Sinclair,algorithms~applied probability~statistics~random walks~M ...,Department of Statistics~Division of Electrical Engineer ...
Andrew F. Stewart,archaeology~classics~Greek sculpture~ancient art and arc ...,Department of Classics~Department of History of Art
Anne Nesbet,culture~film studies~Slavic languages~early Soviet cultu ...,Department of Film and Media~Department of Italian Studies
Anne-lise Francois,popular culture~English~comparative literature~the moder ...,Department of English~Department of French
Anthony J. Cascardi,English~comparative literature~literature~Spanish~Portug ...,Department of Comparative Literature~Department of Spani ...
Anton Kaes,film studies~modern literature~literary and cultural the ...,Department of Film and Media~Department of German
Aram Thomasian,statistics~electrical engineering~computer science rando ...,Department of Statistics~Division of Electrical Engineer ...
Barbara Spackman,feminist theory~psychoanalysis~culture~fascism~gender st ...,Department of Comparative Literature~Department of Itali ...
Barrie Thorne,feminist theory~gender theory~ethnography~qualitative me ...,Department of Gender and Women's Studies~Department ...


In [9]:
# Save the multi-department faculty table as a CSV file, leaving out the
# pandas row index.
cleaned_multiple_dept_faculty.to_df().to_csv(
    path_or_buf='cleaned_multiple_dept_faculty.csv',
    index=False,
)