# In the US from 1999 to 2015, were legal teams featuring greater proportions of female lawyers more or less likely to forum shop in patent law cases? 

# Introduction

Brief introduction
Write a brief introduction of your project (two to four paragraphs), the source
of the data, and the important background necessary to understand your project
(keep it short; you will complete it over time). An outsider should be able to
understand what you are trying to do in this project. You should state your data
and its source, discuss your research question, and briefly mention your
findings. You should have at least five citations, one of which is your main
paper; that paper should be the closest paper to your work. The citations
should also appear in your references section, in APA or Chicago style.

## Set-up

In [None]:
# ALASKA AND HAWAII: https://medium.com/@alex_44314/use-python-geopandas-to-make-a-us-map-with-alaska-and-hawaii-39a9f5c222c6 

# import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import gc
from sklearn.linear_model import LinearRegression
import re

# make head display all columns instead of truncating
pd.options.display.max_colwidth = None
pd.options.display.max_columns = None
pd.set_option("display.max_rows", 100)

In [None]:
# use to search up names to insert into gender_nocode dataset
# gender = pd.read_csv("C://Users/schwa/OneDrive/Desktop/School/ECO225/Data/wgnd_2_0_name-gender-code.csv")

Cases data

In [None]:
# call cases data to jupyter notebook
cases = pd.read_csv("C://Users/schwa/OneDrive/Desktop/School/ECO225/Data/cases.csv")
cases = cases.sort_values("case_name", ascending=True)

### prepare the cases dataset for merging

# create year variable: the part of the case number after the colon starts with
# a two-digit year. Values above 17 are read as 19xx, everything else as 20xx.
# (The earlier strict "> 17" / "< 17" pair left a two-digit year of exactly 17
# unconverted, i.e. as the bare value 17.)
two_digit_year = cases["case_number"].str.split(":").str[1].str[:2].astype(float)
cases["year_filed"] = two_digit_year + np.where(two_digit_year > 17, 1900, 2000)
# rows without a parsable year end up as 0
cases["year_filed"] = cases["year_filed"].fillna(0).astype(float)

# these observations don't have a properly formatted case number
cases = cases.drop(index=[0, 1885, 1886])

# create case order variable (indicates 1 more than the number of cases that the court received before that case in that year)
docket_part = cases["case_number"].str.split(":").str[1]
cases["case_order"] = docket_part.str.split("-").str[2].astype(float)

# create variable showing the courthouse within the district that received the case
cases["courthouse"] = cases["case_number"].str.split(":").str[0]

# standardize the court name variable so that it can be merged
cases["court_name"] = cases["court_name"].str.title()

# adjust these two specific values
# regex=False: the search strings contain "." and "(...)", which must be matched
# literally rather than interpreted as regular-expression syntax
cases["court_name"] = cases["court_name"].str.replace(
    "U.S. District Court (Spokane)", "Eastern District Of Washington", regex=False
)
cases["court_name"] = cases["court_name"].str.replace(
    "U.S. District Court (7)", "7th Court of Appeals", regex=False
)

# create region variable; court names matching none of these get "None"
for region in ("Eastern", "Southern", "Western", "Northern"):
    cases.loc[cases["court_name"].str.contains(region), "court_region"] = region
cases["court_region"] = cases["court_region"].fillna("None")

# create state variable
# NOTE: the list order matters - a later match overwrites an earlier one, so
# "West Virginia" has to come after "Virginia"
state_names = [
    "Alaska", "Alabama", "Arkansas", "American Samoa", "Arizona", "California",
    "Colorado", "Connecticut", "District Of Columbia", "Delaware", "Florida",
    "Georgia", "Guam", "Hawaii", "Iowa", "Idaho", "Illinois", "Indiana",
    "Kansas", "Kentucky", "Louisiana", "Massachusetts", "Maryland", "Maine",
    "Michigan", "Minnesota", "Missouri", "Mississippi", "Montana",
    "North Carolina", "North Dakota", "Nebraska", "New Hampshire",
    "New Jersey", "New Mexico", "Nevada", "New York", "Ohio", "Oklahoma",
    "Oregon", "Pennsylvania", "Puerto Rico", "Rhode Island", "South Carolina",
    "South Dakota", "Tennessee", "Texas", "Utah", "Virginia", "Virgin Islands",
    "Vermont", "Washington", "Wisconsin", "West Virginia", "Wyoming", "7th",
]

for state_name in state_names:
    names_state = cases["court_name"].str.contains(state_name)
    cases.loc[names_state, "court_state"] = state_name

# process demand variable to get rid of non-numeric values
cases["demand"] = cases["demand"].astype(str)
# purely alphabetic entries are party codes (and the string "nan" for missing
# values); everything else is treated as a dollar amount
is_party_code = cases["demand"].str.strip().str.isalpha().astype(bool)

cases.loc[~is_party_code, "demand_num"] = cases["demand"]
# regex=False: "$" is a regular-expression anchor, so strip it as a literal character
cases["demand_num"] = cases["demand_num"].str.replace("$", "", regex=False)
cases["demand_num"] = cases["demand_num"].str.replace(",", "", regex=False)
cases["demand_num"] = cases["demand_num"].astype(float)

cases.loc[is_party_code, "demand_party"] = cases["demand"]
cases["demand_party"] = cases["demand_party"].astype(str)
cases["demand_party"] = cases["demand_party"].str.replace("p", "Plaintiff", regex=False)
cases.loc[cases["demand_party"] == "P", "demand_party"] = "Plaintiff"
# "y"/"Y" and the stringified missing value carry no party information
cases.loc[cases["demand_party"].isin(["y", "Y", "nan"]), "demand_party"] = np.nan
# check you only have the values you desire
cases["demand_party"].unique()

# check for missing values
# cases.isna().sum()

In [None]:
# call pacer_cases data to jupyter notebook
pacer_cases = pd.read_csv("C://Users/schwa/OneDrive/Desktop/School/ECO225/Data/pacer_cases.csv")
pacer_cases = pacer_cases.sort_values("case_name", ascending=True)

# prepare the pacer_cases dataset for merging

# construct year variable
pacer_docket = pacer_cases["case_number"].str.split(":").str[1]
year_text = pacer_docket.str[:4]
# .eq(False) keeps missing case numbers out of the mask (NaN compares as False)
needs_century = year_text.str.isnumeric().eq(False)
two_digit_year = year_text.str[:2].astype(float)
# use the same century rule as the cases dataset: two-digit years above 17 are
# 19xx, the rest 20xx (adding 2000 unconditionally turned "99" into 2099 and so
# those rows could never match the cases dataset on year_filed)
century = np.where(two_digit_year > 17, 1900, 2000)
pacer_cases["year_filed"] = year_text
pacer_cases.loc[needs_century, "year_filed"] = two_digit_year + century
pacer_cases["year_filed"] = pacer_cases["year_filed"].fillna(0).astype(int)
# NOTE: 0 indicates NaN, 56 values

# construct case order (-1 indicates the number of cases that the court received before that case in that year)
case_order = pacer_docket.str.split("-").str[2]
# where the third dash-separated piece is not purely numeric, first cut it at
# the first ".", and if that is still not numeric keep only its first 5 characters
case_order = case_order.mask(case_order.str.isnumeric().eq(False), case_order.str.split(".").str[0])
case_order = case_order.mask(case_order.str.isnumeric().eq(False), case_order.str[:5])
pacer_cases["case_order"] = case_order.astype(float)

# indicates the courthouse within the district that received the case
pacer_cases["courthouse"] = pacer_cases["case_number"].str.split(":").str[0]

# standardize the court name variable so that it can be merged
# (title-case it and keep only the text before the first opening parenthesis)
pacer_court = pacer_cases["court_name"].str.title()
pacer_cases["court_name"] = pacer_court.str.split("(").str[0]

# create region variable; court names matching none of these get "None"
for region_label in ("Eastern", "Southern", "Western", "Northern"):
    in_region = pacer_cases["court_name"].str.contains(region_label)
    pacer_cases.loc[in_region, "court_region"] = region_label
pacer_cases["court_region"] = pacer_cases["court_region"].fillna("None")

# create state variable
# NOTE: the list order matters - a later match overwrites an earlier one, so
# "West Virginia" has to come after "Virginia"
state_names = [
    "Alaska", "Alabama", "Arkansas", "American Samoa", "Arizona", "California",
    "Colorado", "Connecticut", "District Of Columbia", "Delaware", "Florida",
    "Georgia", "Guam", "Hawaii", "Iowa", "Idaho", "Illinois", "Indiana",
    "Kansas", "Kentucky", "Louisiana", "Massachusetts", "Maryland", "Maine",
    "Michigan", "Minnesota", "Missouri", "Mississippi", "Montana",
    "North Carolina", "North Dakota", "Nebraska", "New Hampshire",
    "New Jersey", "New Mexico", "Nevada", "New York", "Ohio", "Oklahoma",
    "Oregon", "Pennsylvania", "Puerto Rico", "Rhode Island", "South Carolina",
    "South Dakota", "Tennessee", "Texas", "Utah", "Virginia", "Virgin Islands",
    "Vermont", "Washington", "Wisconsin", "West Virginia", "Wyoming", "7th",
]

for state_name in state_names:
    names_state = pacer_cases["court_name"].str.contains(state_name)
    pacer_cases.loc[names_state, "court_state"] = state_name

# check for missing values
# pacer_cases.isna().sum()
# pacer_cases[pacer_cases["case_order"].isna()]
# for 76 rows, the pacer dataset doesn't have a valid case number 

In [None]:
# merge: keep every row of cases and attach the pacer_cases row that shares
# the same year, case order, courthouse, region and state
m_cases = pd.merge(
    cases,
    pacer_cases,
    on=["year_filed", "case_order", "courthouse", "court_region", "court_state"],
    how="left",
)

# fill in missing values from the cases dataset using the pacer_cases dataset
# (each column is filled from its own pacer counterpart; date_closed was
# previously filled from the pacer filing date by mistake)
for shared_column in ("case_name", "date_filed", "date_closed"):
    m_cases[f"{shared_column}_x"] = m_cases[f"{shared_column}_x"].fillna(m_cases[f"{shared_column}_y"])

# check for missing values
m_cases.isna().sum()

# group dataset by whether the year is before or after 1999
filed_from_1999 = m_cases["year_filed"] >= 1999
filed_before_1999 = m_cases["year_filed"] < 1999
m_cases.loc[filed_from_1999, "post_99"] = 1
m_cases.loc[filed_before_1999, "post_99"] = 0
# the grouped object is built before columns are dropped below
grouped_cases = m_cases.groupby(by="post_99")

# convert dates to date format
for date_column, raw_column in (
    ("date_filed", "date_filed_x"),
    ("date_closed", "date_closed_x"),
    ("date_last_filed", "date_last_filed"),
):
    m_cases[date_column] = pd.to_datetime(m_cases[raw_column])

# drop superfluous variables
superfluous = [
    "date_filed_x", "date_closed_x", "case_name_y", "court_name_y",
    "date_closed_y", "case_number_y", "pacer_id_y", "date_filed_y",
]
m_cases = m_cases.drop(columns=superfluous)

# make a subset of the dataset with just the cases since 1999
cases_99 = m_cases.loc[m_cases["post_99"] == 1].copy()

# list(cases_99["case_cause"].unique())

# create a binary variable indicating whether a case involves patent infringement
normalized_cause = cases_99["case_cause"].str.lower().str.strip()
cases_99["case_cause"] = normalized_cause
mentions_infringement = (
    normalized_cause.str.contains("infringement of patent", na=False)
    | normalized_cause.str.contains("patent infringement", na=False)
)
cases_99.loc[mentions_infringement, "patent_infringement"] = 1
cases_99["patent_infringement"] = cases_99["patent_infringement"].fillna(0)

list(cases_99["case_cause"].unique())
cases_99["patent_infringement"].value_counts()

# delete superfluous datasets, then rebind the names to empty frames
del m_cases, cases, pacer_cases
gc.collect()
m_cases, cases, pacer_cases = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

Attorneys Data

In [None]:
# call data to jupyter notebook
attorneys = pd.read_csv("C://Users/schwa/OneDrive/Desktop/School/ECO225/Data/attorneys.csv")
gender = pd.read_csv("C://Users/schwa/OneDrive/Desktop/School/ECO225/Data/wgnd_2_0_name-gender_nocode.csv")

# fill (See above for address) using above: blank out the placeholder, then
# carry the previous row's contact info forward
contact = attorneys["contactinfo"].fillna(0).str.strip()
contact = contact.replace('(See above for address)', np.nan)
attorneys["contactinfo"] = contact.ffill()

# pull ", XX 12345" out of the contact info and keep what follows its first 5 characters
state_and_zip = attorneys["contactinfo"].str.extract(r"(, [a-zA-Z][a-zA-Z] \d\d\d\d\d)")
attorneys["zip"] = state_and_zip[0].str[5:]
attorneys.head(40)
attorneys[attorneys["zip"].isna()].head(30)
attorneys.isna().sum()

# collapse dataset so that there's only one entry per attorney per case
attorneys = attorneys.groupby(["case_row_id", "name"]).first().reset_index()

In [None]:
# extract firm name: the text before the first ";" in the contact info
attorneys["firm"] = attorneys["contactinfo"].str.split(";", n=1).str[0]

# create a 100 row sample
# sample_att = attorneys.sample(n=100)

# examine the sample
# pd.options.display.max_rows = 100
# sample_att[["name", "contactinfo", "firm"]]

In [None]:
# create variable with the lower-cased first whitespace-separated token of the name
leading_token = attorneys["name"].str.split().str[0]
attorneys["first_name"] = leading_token.str.lower().astype(str)

# merge with gender dataset, drop the dataset
attorneys = pd.merge(attorneys, gender, how="left", left_on="first_name", right_on="name")
del gender
gc.collect()
gender = pd.DataFrame()

# attorneys.isna().sum()

In [None]:
# attorneys that have a name but got no gender from the first merge
gender_unknown = attorneys["gender"].isna()
has_name = attorneys["name_x"].notna()
missing_attorneys = attorneys[gender_unknown & has_name]

# merge with dataset with gender names for US
gender = pd.read_csv("C://Users/schwa/OneDrive/Desktop/School/ECO225/Data/wgnd_2_0_name-gender-code_US_likely.csv")
missing_attorneys = pd.merge(missing_attorneys, gender, how="left", left_on="first_name", right_on="name")

# drop variables so you can concat the missing_attorneys dataset with the attorneys dataset
missing_attorneys = missing_attorneys.drop(columns=["name_y", "gender_x"])
missing_attorneys = missing_attorneys.rename(columns={"gender_y": "gender"})

# export the names that are still unmatched
second_pass_unknown = missing_attorneys["gender"].isna()
still_missing = missing_attorneys[second_pass_unknown]
still_missing.to_csv("C://Users/schwa/OneDrive/Desktop/School/ECO225/Data/still_missing_attorneys.csv", index=False)
# processed these names using https://genderize.io/tools/csv

missing_attorneys = missing_attorneys[missing_attorneys["gender"].notna()]
# missing_attorneys.isna().sum()

In [None]:
# drop missing values from the attorneys dataset, add back the missing_attorneys
matched_first_pass = attorneys[attorneys["gender"].notna()]
matched_first_pass = matched_first_pass.rename(columns={"name_y": "name"})
attorneys = pd.concat([matched_first_pass, missing_attorneys], axis=0)

del missing_attorneys
gc.collect()
missing_attorneys = pd.DataFrame()

In [None]:
# add back the still_missing_attorneys
still_missing = pd.read_csv("C://Users/schwa/OneDrive/Desktop/School/ECO225/Data/still_missing_attorneys_genderized_copy.csv")

# recode the gender labels to the M/F codes used in the attorneys dataset
still_missing.loc[still_missing.gender == "male", "gender"] = "M"
still_missing.loc[still_missing.gender == "female", "gender"] = "F"
# DataFrame.replace returns a new frame; without the assignment the "unknown"
# labels were never actually turned into missing values
still_missing = still_missing.replace("unknown", np.nan)

# still_missing

In [None]:
# append the genderized leftovers, then discard the gender-lookup name column
attorneys = pd.concat([attorneys, still_missing], axis=0)
attorneys = attorneys.drop(columns=["name"])

# attorneys.isna().sum()

In [None]:
# use to see the possible values for party type
# list(attorneys["party_type"].unique())

# list(attorneys)

In [None]:
# label each attorney either defendant, plaintiff, or other
party = attorneys["party_type"].str.lower()
attorneys["party_type"] = party

# na=False: a missing party type matches nothing (so the attorney is counted as
# "other") instead of raising when the NaN-containing mask is used for indexing
is_defendant = party.str.contains("defendant|respondent|counter claimant|dft", na=False).to_numpy()
is_plaintiff = party.str.contains("plaintiff", na=False).to_numpy()
# "other" means neither a defendant-side nor a plaintiff-side party type
is_other = ~(is_defendant | is_plaintiff)

is_male = (attorneys["gender"] == "M").to_numpy()
is_female = (attorneys["gender"] == "F").to_numpy()

# one 0/1 column per category, plus gendered variables for each attorney category
# (plain arrays are assigned so that repeated index labels cannot affect alignment)
flag_columns = {
    "defendant": is_defendant,
    "plaintiff": is_plaintiff,
    "other": is_other,
    "defendant_M": is_defendant & is_male,
    "defendant_F": is_defendant & is_female,
    "plaintiff_M": is_plaintiff & is_male,
    "plaintiff_F": is_plaintiff & is_female,
    "other_M": is_other & is_male,
    "other_F": is_other & is_female,
    "male": is_male,
    "female": is_female,
}
for flag_name, flag_values in flag_columns.items():
    attorneys[flag_name] = flag_values.astype(float)

# every row counts as one attorney when summed per case
attorneys["total"] = 1

# attorneys.head(20)

In [None]:
# delete unnecessary columns, then try aggregating
descriptive_columns = [
    "name_x", "case_number", 'party_row_count', 'party_type',
    'attorney_row_count', 'contactinfo', 'position', 'zip', 'firm',
    'first_name', "gender",
]
c_attorneys = attorneys.drop(labels=descriptive_columns, axis=1)

In [None]:
# show the remaining column names
c_attorneys.columns.tolist()

In [None]:
# aggregate to create one row per case: every flag column is summed within a case
count_columns = [
    'defendant', 'plaintiff', 'other',
    'defendant_M', 'defendant_F',
    'plaintiff_M', 'plaintiff_F',
    'other_M', 'other_F',
    'male', 'female', 'total',
]
c_attorneys = c_attorneys.groupby("case_row_id")[count_columns].sum()

c_attorneys = c_attorneys.fillna(0)
c_attorneys.head(10)

Documents data

In [None]:
# call data to jupyter notebook
documents = pd.read_csv(
    "C://Users/schwa/OneDrive/Desktop/School/ECO225/Data/documents.csv",
    low_memory=False,
)

# clean: give every document row a count of 1, then sum within each case
documents["number_docs"] = 1
doc_agg = documents.groupby("case_row_id").sum()

# doc_agg.head(10)

In [None]:
# discard the first eight columns of the per-case sums (selected by position)
doc_agg = doc_agg.drop(columns=doc_agg.columns[0:8])

In [None]:
# use for code trying to see types of documents
# flag documents whose long description contains the whole word "settle" (any case)
mentions_settle = documents['long_description'].str.contains(r"\bsettle\b", na=False, case=False)
documents.loc[mentions_settle, "settled"] = 1
# documents[documents["settled"] == 1]

Names data

In [None]:
# call data to jupyter notebook
names = pd.read_csv(
    filepath_or_buffer="C://Users/schwa/OneDrive/Desktop/School/ECO225/Data/names.csv"
)
# names

In [None]:
# view duplicates
# names[names.duplicated(subset=['case_row_id','name'], keep=False)].sort_values(by=["name", "case_row_id"], ascending=True)

In [None]:
# it appears that the party_row_count is a unique identifer of a party in a case, and the name column includes information other than names
# names[names.duplicated(subset=['party_row_count'], keep=False)].sort_values("party_row_count", ascending=True).head(60)

## Merging

In [None]:
# merge to create dataframe with one row per case
df_cases = pd.merge(cases_99, c_attorneys, how="left", on="case_row_id")
df_cases = pd.merge(df_cases, doc_agg, how="left", on="case_row_id")
df_cases.isna().sum()

# case-level columns that are not carried into the merged dataset
unused_case_columns = [
    "pacer_id_x", "assigned_to", "referred_to", "case_cause",
    "jurisdictional_basis", "jury_demand", "lead_case", "related_case",
    "settlement", "date_last_filed", "demand_num", "demand", "demand_party",
    "court_code", "post_99", "date_filed", "date_closed",
]
df_cases = df_cases.drop(columns=unused_case_columns)

df_cases = df_cases.sort_values(by=["case_row_id"])

# strip the merge suffix from the columns that came from the cases dataset
df_cases = df_cases.rename(
    columns={
        "case_number_x": "case_number",
        "case_name_x": "case_name",
        "court_name_x": "court_name",
    }
)

# df_cases.head(20)

In [None]:
# merge to create df with one row per attorney (only attorneys whose case is in df_cases)
df_att = pd.merge(attorneys, df_cases, how="inner", on="case_row_id")
# df_att = df_att.merge(doc_agg, on="case_row_id", how="left")
df_att

In [None]:
# create inhouse attorney dataset

df_att = df_att.drop(columns=["total_x", "case_number_y", "party_row_count"])

# rename columns to ready for merging: "_x" columns are the attorney-level
# flags, "_y" columns are the per-case totals
df_att = df_att.rename(columns={
    "case_number_x": "case_number",
    "name_x": "name_attorney",
    "defendant_x": "defendant",
    "plaintiff_x": "plaintiff",
    "other_x": "other",
    "defendant_M_x": "defendant_M",
    "defendant_F_x": "defendant_F",
    "plaintiff_M_x": "plaintiff_M",
    "plaintiff_F_x": "plaintiff_F",
    "other_M_x": "other_M",
    "other_F_x": "other_F",
    "male_x": "male",
    "female_x": "female",
    "defendant_y": "tot_def",
    "plaintiff_y": "tot_plain",
    "witness_y": "tot_wit",
    "other_y": "tot_other",
    "defendant_M_y": "tot_def_M",
    "defendant_F_y": "tot_def_F",
    "plaintiff_M_y": "tot_plain_M",
    "plaintiff_F_y": "tot_plain_F",
    "other_M_y": "tot_other_M",
    "other_F_y": "tot_other_F",
    "male_y": "tot_M",
    "female_y": "tot_F",
    "total_y": "tot",
})

df_att["firm"] = df_att["firm"].str.title()
names["name"] = names["name"].str.title()

# strip corporate suffixes from both sides so firm and party names can match.
# regex=False: several suffixes contain ".", which as a regular expression would
# match any character (e.g. "Co." would also remove the start of "Cooper")
for extra in ["Llc", " L.L.C.", " Co ", "Company", "Corporation", "Co.", "Inc.", "Incorporated", "Corp."]:
    df_att["firm"] = df_att["firm"].str.replace(extra, "", regex=False)
    names["name"] = names["name"].str.replace(extra, "", regex=False)

# trim leftover commas, periods and spaces at either end
df_att["firm"] = df_att["firm"].str.strip(",. ")
names["name"] = names["name"].str.strip(",. ")

# names.drop(columns=["case_number", "party_row_count"], inplace=True)

# names.drop(columns=["case_number", "party_row_count"], inplace=True)

In [None]:
# merge name and df on party/firm to create a dataset of inhouse attorneys:
# keep attorneys whose firm equals a party name in the same case
df_inhouse = pd.merge(
    df_att,
    names,
    how="inner",
    left_on=["firm", "case_row_id"],
    right_on=["name", "case_row_id"],
)

list(df_inhouse)

In [None]:
# collapse dataset by attorney, party_row_count (and case), keeping the first value per group
inhouse_keys = ["name_attorney", "party_row_count", "case_row_id"]
df_inhouse = df_inhouse.groupby(inhouse_keys).first().reset_index()
# df_inhouse

In [None]:
# df_inhouse[df_inhouse["gender"].isna()]

In [None]:
# df_inhouse[["name_attorney", "firm", "name", "case_name"]].sample(n=100)
# write the three merged datasets to disk without the index column
for merged_frame, csv_path in (
    (df_inhouse, "C://Users/schwa/OneDrive/Desktop/School/ECO225/Data/inhouse_attorneys.csv"),
    (df_att, "C://Users/schwa/OneDrive/Desktop/School/ECO225/Data/attorneys_merged.csv"),
    (df_cases, "C://Users/schwa/OneDrive/Desktop/School/ECO225/Data/cases_merged.csv"),
):
    merged_frame.to_csv(csv_path, index=False)

## Creating summary statistics and figures

In [None]:
# summary statistics, each value formatted with five decimal places
five_decimals = '{0:.5f}'.format
df_inhouse.describe().apply(lambda column: column.map(five_decimals))

In [None]:
### figures/summary stats for understanding the dataset

# m_cases was rebound to an empty DataFrame after the merged subset was built,
# so indexing it by column raises a KeyError. The grouped object still holds
# the full merged frame (pre and post 1999) it was created from; use that.
all_cases = grouped_cases.obj

# general summary statistics for numeric variables
summary_numeric = grouped_cases[["year_filed", "demand_num"]].describe()
print(summary_numeric)

# create crosstab of jury demand
freq_jury = grouped_cases["jury_demand"].value_counts()
print(freq_jury)

# create a histogram of filing over time
plt.hist(all_cases["year_filed"], bins=30, color='skyblue', edgecolor='black')
# Adding labels and title
plt.xlabel('Year')
plt.ylabel('Frequency')
plt.title('Distribution of case filing years')
# Display the plot
plt.show()

# create a bar chart of the distribution of cases across year by most filed in states
most_filed_states_id = all_cases["court_state"].value_counts().nlargest(5).index
most_filed_states = all_cases.loc[all_cases["court_state"].isin(most_filed_states_id), :]
list(most_filed_states["court_state"].unique())
sns.set()
ax = sns.countplot(data=most_filed_states, x='post_99', hue='court_state')
ax.set_xlabel('Pre versus post 1999')
plt.tight_layout()
plt.show()

In [None]:
# NOTE(review): `df` is not assigned anywhere in the visible notebook; presumably
# it is a case-level frame created in a cell that was removed - confirm before running.
df.describe()

In [None]:
# histogram of the number of documents filed per case
# (only cases strictly below the 95th percentile of number_docs are plotted)
docs_cutoff = df["number_docs"].quantile(0.95)
df_hist = df[df["number_docs"] < docs_cutoff]

fig, ax = plt.subplots()
df_hist.plot(
    y="number_docs", kind="hist",
    bins=20, density=True, legend=False, edgecolor="black", ax=ax
)
light_grey = (0.96, 0.96, 0.96)
ax.set_facecolor(light_grey)
fig.set_facecolor(light_grey)
for side in ('right', 'top'):
    ax.spines[side].set_visible(False)
ax.set_title("Distribution of documents filed per US patent case, 1999-2015")
plt.savefig("C:/Users/schwa/OneDrive/Desktop/School/ECO225/Results/histogram_doc_filings.png", dpi=300, format="png")

In [None]:
### bar chart of % of lawyers who are women by party type

# collapse dataset over year ranges
# (the first 14 columns, selected by position, and the date columns are discarded first)
df_year = df.drop(columns=df.columns[0:14])
df_year = df_year.drop(columns=["date_filed", "date_closed", "date_last_filed"])
filing_year = df_year["year_filed"]
df_year.loc[filing_year <= 2005, "year_range"] = "1999-2005"
df_year.loc[filing_year.between(2006, 2010), "year_range"] = "2006-2010"
df_year.loc[filing_year >= 2011, "year_range"] = "2011-2015"
df_year = df_year.groupby("year_range").sum().reset_index()

# create variables with proportion of female attorneys for each party type
for share_column, female_column, male_column in (
    ("percent_f", "female", "male"),
    ("percent_pl_f", "plaintiff_F", "plaintiff_M"),
    ("percent_def_f", "defendant_F", "defendant_M"),
):
    women = df_year[female_column]
    df_year[share_column] = 100 * women / (women + df_year[male_column])
# df_year["percent_other_f"] = 100 * df_year["other_F"] / (df_year["other_F"] + df_year["other_M"])

# drop the columns at positions 1-27, leaving the first column and whatever follows position 27
df_year = df_year.drop(columns=df_year.columns[1:28])

# df_year.plot(x="year_range", kind="bar", stacked=False)

# df_year.plot(x="year_range", kind="bar", stacked=False)

In [None]:
# one [all, defendant, plaintiff] list of female percentages per row of df_year
Row_list = [
    [row.percent_f, row.percent_def_f, row.percent_pl_f]
    for _, row in df_year.iterrows()
]

print(Row_list)
df_year

In [None]:
# grouped bar chart: one group per year range, one bar per party type.
# Each series is a percentage column of df_year (one value per year range), so
# the legend entries and the year-range tick labels describe the bars they are
# attached to. (Previously each plotted series was one row of df_year - i.e.
# one year range - while being labelled as a party type.)
party_series = [
    ("All", df_year["percent_f"]),
    ("Plaintiff", df_year["percent_pl_f"]),
    ("Defendant", df_year["percent_def_f"]),
]

fig, ax = plt.subplots()

index = np.arange(len(df_year))
width = 0.2

# shift each party's bars by one bar width so the three sit side by side
for position, (label, percentages) in enumerate(party_series):
    ax.bar(
        index + position * width, percentages.to_numpy(), width,
        label=label, edgecolor="black"
    )

ax.set_xlabel('Year')
ax.set_ylabel('Percent of female attorneys')
ax.set_title('Gender make-up of patent attorneys litigating in US district courts, 1999-2015')
# centre the tick under the middle bar of each group
ax.set_xticks(index + width)
ax.set_xticklabels(df_year["year_range"].tolist())
ax.legend()
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)

fig.tight_layout()
plt.savefig("C:/Users/schwa/OneDrive/Desktop/School/ECO225/Results/bar_attorney_gender.png", dpi=300, format="png")

In [None]:
### relationship between the proportion of court filings and the proportion of lawyers in a court who are women

df_court = df.drop(columns=["date_filed", "date_closed", "date_last_filed"])
court_year = df_court["year_filed"]
df_court.loc[court_year <= 2005, "year_range"] = "1999-2005"
df_court.loc[court_year.between(2006, 2010), "year_range"] = "2006-2010"
df_court.loc[court_year >= 2011, "year_range"] = "2011-2015"
df_court = df_court.groupby(["court_state", "court_region", "year_range"]).sum().reset_index()

# create variable showing proportion of case filings made in year range in a given court
# (total_filings is the post_99 total over all courts within the same year range)
for range_label in ("1999-2005", "2006-2010", "2011-2015"):
    in_range = df_court["year_range"] == range_label
    df_court.loc[in_range, "total_filings"] = df_court[in_range]["post_99"].sum(axis=0)
df_court["percent_filings"] = 100 * df_court["post_99"] / df_court["total_filings"]

df_court["percent_female"] = 100 * df_court["female"] / (df_court["female"] + df_court["male"])

df_court.head(10)
# a missing share (no male or female attorneys counted) is set to 0
df_court["percent_female"] = df_court["percent_female"].fillna(0)

In [None]:
def single_scatter_plot(df, year, ax):
    """
    Draw one year range's percent-filings vs. percent-female scatter plot
    together with a fitted least-squares line.

    Parameters
    ----------
    df : DataFrame with "year_range", "percent_filings" and "percent_female" columns
    year : value of "year_range" selecting the rows to plot
    ax : matplotlib axes to draw on

    Returns
    -------
    The axes that was passed in.
    """
    # Filter the frame that was passed in (the function used to ignore its
    # `df` argument and always read the global df_court)
    _df = df[df['year_range'] == year]
    _df.plot(
        kind="scatter", x="percent_filings", y="percent_female", ax=ax
    )

    # reshape(-1, 1) turns each column into the 2-D array the regression expects
    lr = LinearRegression()
    X = _df["percent_filings"].values.reshape(-1, 1)
    y = _df["percent_female"].values.reshape(-1, 1)
    lr.fit(X, y)

    # draw the fitted line over a fixed span of the x axis
    x = np.linspace(2.0, 12.0).reshape(-1, 1)
    y_pred = lr.predict(x)
    ax.plot(x, y_pred)

    return ax

# Create initial plot: one panel per year range
fig, ax = plt.subplots(1, 3, figsize=(16, 6))

for (i, year) in enumerate(df_court.year_range.unique()):
    # pass the court-level frame, which is the one holding the plotted columns
    single_scatter_plot(df_court, year, ax[i])
    ax[i].set_title(str(year))

bgcolor = (250/255, 250/255, 250/255)
fig.set_facecolor(bgcolor)
for (i, _ax) in enumerate(ax):
    # Only the first panel carries axis labels
    if i == 0:
        _ax.set_xlabel("Percent of all patent cases filed in the district")
        _ax.set_ylabel("Percent female attorneys listed on cases filed")
    else:
        _ax.set_xlabel("")
        _ax.set_ylabel("")

    # Turn off right and top axis lines
    _ax.spines['right'].set_visible(False)
    _ax.spines['top'].set_visible(False)

    # Don't use such a white background color
    _ax.set_facecolor(bgcolor)

    # Change bounds
    _ax.set_ylim((0, 50))
    _ax.set_xlim((-0.2, 20))

ax[0].set_zorder(1)
fig.suptitle("US federal courts' patent filings and the gender make-up of attorneys")
plt.savefig("C:/Users/schwa/OneDrive/Desktop/School/ECO225/Results/scatter_filings_gender.png", dpi=300, format="png")

In [None]:
# percent filings in top 10 courts by year
# NOTE(review): `df` is not assigned anywhere in the visible notebook - confirm where it comes from.
# NOTE(review): several steps below drop columns by position (iloc slices), so the
# result depends on the exact column order of `df`, which is not visible here.

# sum every column per (state, region, filing year)
df_top = df.drop(["date_filed", "date_closed", "date_last_filed"], axis=1)
df_top = df_top.groupby(["court_state", "court_region", "year_filed"]).agg("sum")
df_top = df_top.reset_index()
# drop the columns at positions 3-21
df_top = df_top.drop(df_top.iloc[:, 3:22], axis=1)

# create variable showing proportion of case filings made in year range in a given court
# total_filings = sum of post_99 over all courts for that filing year
# (presumably post_99 is 1 on every underlying case, so its sum is a case count - verify)
for year in range(1999, 2016):
    df_top.loc[df_top["year_filed"] == year, "total_filings"] = df_top[df_top["year_filed"] == year]["post_99"].sum(axis=0)

# create new dataset with top 10 courts, bottom 10 courts
# first stack, for every year, the 10 rows with the largest post_99 and mark them top10 = 1
df_top_years = df_top[df_top["year_filed"] == 1999].nlargest(10, "post_99")
for year in range(2000, 2016):
    _df = df_top[df_top["year_filed"] == year].nlargest(10, "post_99")
    df_top_years = pd.concat([df_top_years, _df], axis = 0)
df_top_years["top10"] = 1

# then append, for every year, the 84 rows with the smallest post_99; these have no
# top10 value yet, so the fillna(0) below marks them top10 = 0
# NOTE(review): if a year has fewer than 94 rows, these 84 overlap with the top 10
# and those rows are counted in both groups
for year in range(1999, 2016):
    _df = df_top[df_top["year_filed"] == year].nsmallest(84, "post_99")
    df_top_years = pd.concat([df_top_years, _df], axis = 0)
df_top_years = df_top_years.fillna(0)

# positional drops: first positions 4-22, then the first two columns
df_top_years = df_top_years.drop(df_top_years.iloc[:, 4:23], axis=1)
df_top_years = df_top_years.drop(df_top_years.iloc[:, 0:2], axis=1)
df_top_years["year_filed"] = df_top_years["year_filed"].astype(int)
# per (top10, year): total post_99 of the group and the year's overall total
df_top_years = df_top_years.groupby(["top10", "year_filed"]).agg({"post_99":"sum", "total_filings": "max"})

# share of the year's filings that fall in each group
df_top_years["percent_filings"] = 100* df_top_years["post_99"] / df_top_years["total_filings"]

# df_top_years

# df_top_years

In [None]:
# keep only the percent_filings of the top10 == 1 group, indexed by filing year
df_top_years = df_top_years.drop(columns=["post_99", "total_filings"])
df_top_years = df_top_years.reset_index(level="top10")
in_top10 = df_top_years["top10"] == 1
df_top_years = df_top_years[in_top10].drop(columns="top10")

fig, ax = plt.subplots()

df_top_years["percent_filings"].plot(ax=ax, kind="bar", color="#1b42fc")
for side in ('right', 'top'):
    ax.spines[side].set_visible(False)
ax.set_xlabel("Year")
ax.set_ylabel("Percent filings in 10 most filed in district courts")
ax.set_title("Patent cases filed in most filed in US district circuits, 1999-2015")
plt.savefig("C:/Users/schwa/OneDrive/Desktop/School/ECO225/Results/bar_top10.png", dpi=200, format="png", bbox_inches="tight")