In [None]:
import snorkel
import os
os.environ["WORKING_DIR"] = "D:\\semantic_data_lake\\semantic_data_lake"
os.environ["TYPENAME"] = "type_turl"
import sys
sys.path.append(os.environ["WORKING_DIR"])
from os.path import join
import json
import pandas as pd
from sklearn.preprocessing import LabelEncoder
pd.__version__

In [None]:
import tensorflow
tensorflow.__version__

In [None]:
%run combine_LFs_labels.py --corpus turl

In [None]:
from knowledge_graph.google_knowledge_graph import get_googleKG_types_for_series, get_googleKG_types_for_entity

get_googleKG_types_for_entity("Lothar Matthäus")

In [None]:
# load data to label
corpus = "turl"
labeled_data_size = 1
unlabeled_data_size = "absolute"
test_data_size = 20.0#
absolute_numbers = True
TYPENAME = "type_turl"
valid_headers_file = f"{corpus}_{TYPENAME}_valid.json"
os.environ["TURL_DIR"] = "D:\\TURL\\tables"

# LabelEncoder
with open(join(os.environ["WORKING_DIR"], "data", "extract", "out", "valid_types", "types.json")) as f:
    valid_types = json.load(f)[TYPENAME]

label_enc = LabelEncoder()
label_enc.fit(valid_types)

labeled_unlabeled_test_split_path = join(os.environ["WORKING_DIR"], "data",
                                         "extract", "out",
                                         "labeled_unlabeled_test_split")

# load unlabeled data from labeled, unlabeled, test split file und use it for generating new training data
with open(
        join(
            labeled_unlabeled_test_split_path,
            f"{corpus}_{labeled_data_size}_{unlabeled_data_size}_{test_data_size}.json"
        )) as f:
    labeled_unlabeled_test_split_file = json.load(f)
    if absolute_numbers:
        unlabeled_data_ids = labeled_unlabeled_test_split_file[
            f"unlabeled"]
    else:
        unlabeled_data_ids = labeled_unlabeled_test_split_file[
            f"unlabeled{unlabeled_data_size}"]

print(f"Unlabeled data to label: {len(unlabeled_data_ids)}")

# load the valid headers with real sem. types
valid_headers_path = join(os.environ["WORKING_DIR"], "data", "extract", "out",
                          "valid_headers")
valid_headers = join(valid_headers_path, valid_headers_file)
with open(valid_headers, "r") as file:
    valid_headers = json.load(file)
# transform valid header into df to make it joinable with word embeddings
valid_header_df_data = []
for table in valid_headers.keys():
    for column in valid_headers[table].keys():
        valid_header_df_data.append([
            table, column, table + "+" + column,
            valid_headers[table][column]["semanticType"]
        ])
valid_header_df = pd.DataFrame(
    valid_header_df_data,
    columns=["table", "column", "dataset_id", "semanticType"])

# filter out unlabeled data from valid_headers
unlabeled_data_df = valid_header_df.loc[valid_header_df["dataset_id"].isin(unlabeled_data_ids)]

#unlabeled_data_df = unlabeled_data_df.sample(n=50)

In [None]:
# Define the labels
ABSTAIN = -1
# other labels are defined via valid types

In [None]:
unlabeled_data_df.head(10)

## LF: cluster_n_classify

In [None]:
### LF: cluster_n_classify

from snorkel.labeling import labeling_function

distance_threshold = 1e-4

### load generated training data from cluster_n_classify
gen_train_data_path = join(os.environ["WORKING_DIR"], "emb_clus", "without_knn", "out", "gen_training_data")
gen_train_data_file = f"{corpus}_gen_training_data_{distance_threshold}_{labeled_data_size}_{unlabeled_data_size}_{test_data_size}.csv"
gen_train_data_df = pd.read_csv(join(gen_train_data_path, gen_train_data_file))

@labeling_function()
def cluster_n_classify(x):
    if len(gen_train_data_df[gen_train_data_df["dataset_id"] == x["dataset_id"]]) > 0:
        assert len(gen_train_data_df[gen_train_data_df["dataset_id"] == x["dataset_id"]]) == 1
        #print(x["dataset_id"])
        df_to_label = gen_train_data_df[gen_train_data_df["dataset_id"] == x["dataset_id"]]
        #print(gen_train_data_df[gen_train_data_df["dataset_id"] == x["dataset_id"]])
        LABEL = label_enc.transform(df_to_label["predicted_semantic_type"])[0]
        #print(label_enc.transform(df_to_label["predicted_semantic_type"])[0])
        return LABEL
    else:
        return ABSTAIN

In [None]:
#### Apply LFs
import warnings
warnings.filterwarnings('ignore')
from snorkel.labeling import PandasLFApplier
from snorkel.labeling.apply.dask import PandasParallelLFApplier

lfs = [cluster_n_classify]

applier = PandasLFApplier(lfs=lfs)
L_train = applier.apply(df=unlabeled_data_df)

In [None]:
### load google use v5
import tensorflow_hub as hub
# embed = hub.load(
#     join(os.environ["WORKING_DIR"], "emb_clus", "word_embedding", "models",
#          "google_use_3"))

embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder-large/5")

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

df = pd.DataFrame({"valid_types":valid_types})
df["header"] = "organization"
df["cosine_sim"] = df.apply(lambda row: cosine_similarity(embed([row["valid_types"].split(".")[1]]), embed([row["header"]]))[0][0], axis=1)

In [None]:
cosine_similarity(embed(["Atlanta Falcons"]), embed(["Washington Redskins"]))

In [None]:
df.sort_values(by="cosine_sim", ascending=False)

In [None]:
" ".join("test.test".split("."))

## LF: GKG

In [None]:
### LF: Google Knowledge Graph
from snorkel.labeling import labeling_function
from snorkel.preprocess import preprocessor
from knowledge_graph.google_knowledge_graph import get_googleKG_types_for_series, get_googleKG_types_for_entity
import copy
from sklearn.metrics.pairwise import cosine_similarity

### table colum loader of raw data
def load_tablecolumn(dataset_id:str):
    table_id = dataset_id.split("+")[0]
    column_id = dataset_id.split("+")[1].split("_")[1]
    df_column = pd.read_csv(join(os.environ["TURL_DIR"],table_id), usecols=[int(column_id)]).sample(n=5,replace=True, random_state=42)
    return df_column.iloc[:,0].values.tolist()


df_valid_type_WE = pd.DataFrame({"valid_types": valid_types})
#df_valid_type_WE["valid_types_WE"] = df_valid_type_WE.apply(lambda row: embed([" ".join(row["valid_types"].split("."))]), axis=1)
df_valid_type_WE["valid_types_WE"] = df_valid_type_WE.apply(lambda row: embed([row["valid_types"].split(".")[1]]), axis=1)

def sim_score_GKG_valid_types(GKG_types:list):
    results = []
    GKG_types = embed(GKG_types)
    
    for GKG_type in GKG_types:
        df_compare = copy.copy(df_valid_type_WE)
        df_compare["GKG_type"] = [[GKG_type] for x in df_compare["valid_types_WE"]]
        df_compare["cosine_sim"] = df_compare.apply(lambda row: cosine_similarity(row["valid_types_WE"], row["GKG_type"])[0][0], axis=1)
        df_compare = df_compare.sort_values(by="cosine_sim", ascending=False)
        results.append([df_compare.iloc[0]["valid_types"],df_compare.iloc[0]["cosine_sim"]])
    
    result = pd.DataFrame(results, columns=["semantic_type", "cosine_sim"]).sort_values(by="cosine_sim", ascending=False)
    if len(result) < 1:
        return ABSTAIN
    if result.iloc[0]["cosine_sim"] >= 0.9:
        return result.iloc[0]["semantic_type"]
    else: 
        return ABSTAIN

@labeling_function()
def googleKG(x):
    df_GKG_result = get_googleKG_types_for_series(load_tablecolumn(x["dataset_id"]))
    if len(df_GKG_result) >3:
        df_GKG_result = df_GKG_result[0:3]
    #print(df_GKG_result)
    label = sim_score_GKG_valid_types(df_GKG_result["type"].tolist())
    #print(sim_score_GKG_valid_types(df_GKG_result["type"].tolist()))
    if label != ABSTAIN:
        LABEL = label_enc.transform([label])[0]
        return LABEL
    return ABSTAIN

In [None]:
unlabeled_data_df.head(10)

In [None]:
load_tablecolumn("126381_16635546-12.csv+column_0")

In [None]:
get_googleKG_types_for_series(load_tablecolumn("126381_16635546-12.csv+column_0"))

## LF: DBpedia

In [None]:
### LF: DBpedia Lookup
from snorkel.labeling import labeling_function
from snorkel.preprocess import preprocessor
from knowledge_graph.dbpedia import get_dbpedia_types_for_series
import copy
from sklearn.metrics.pairwise import cosine_similarity

### table colum loader of raw data
def load_tablecolumn(dataset_id:str):
    table_id = dataset_id.split("+")[0]
    column_id = dataset_id.split("+")[1].split("_")[1]
    df_column = pd.read_csv(join(os.environ["TURL_DIR"],table_id), usecols=[int(column_id)]).sample(n=5,replace=True, random_state=42)
    return df_column.iloc[:,0].values.tolist()


df_valid_type_WE = pd.DataFrame({"valid_types": valid_types})
#df_valid_type_WE["valid_types_WE"] = df_valid_type_WE.apply(lambda row: embed([" ".join(row["valid_types"].split("."))]), axis=1)
df_valid_type_WE["valid_types_WE"] = df_valid_type_WE.apply(lambda row: embed([row["valid_types"].split(".")[1]]), axis=1)

def sim_score_DBpedia_valid_types(GKG_types:list):
    results = []
    GKG_types = embed(GKG_types)
    
    for GKG_type in GKG_types:
        df_compare = copy.copy(df_valid_type_WE)
        df_compare["GKG_type"] = [[GKG_type] for x in df_compare["valid_types_WE"]]
        df_compare["cosine_sim"] = df_compare.apply(lambda row: cosine_similarity(row["valid_types_WE"], row["GKG_type"])[0][0], axis=1)
        df_compare = df_compare.sort_values(by="cosine_sim", ascending=False)
        results.append([df_compare.iloc[0]["valid_types"],df_compare.iloc[0]["cosine_sim"]])
    
    result = pd.DataFrame(results, columns=["semantic_type", "cosine_sim"]).sort_values(by="cosine_sim", ascending=False)
    if len(result) < 1:
        return ABSTAIN
    if result.iloc[0]["cosine_sim"] >= 0.9:
        return result.iloc[0]["semantic_type"]
    else: 
        return ABSTAIN

@labeling_function()
def dbpedia_lookup(x):
    df_GKG_result = get_dbpedia_types_for_series(load_tablecolumn(x["dataset_id"]))
    if len(df_GKG_result) >3:
        df_GKG_result = df_GKG_result[0:3]
    #print(df_GKG_result)
    label = sim_score_DBpedia_valid_types(df_GKG_result["type"].tolist())
    #print(sim_score_GKG_valid_types(df_GKG_result["type"].tolist()))
    if label != ABSTAIN:
        LABEL = label_enc.transform([label])[0]
        return LABEL
    return ABSTAIN

In [None]:
get_dbpedia_types_for_series(load_tablecolumn("46571_31788759-5.csv+column_0"))

In [None]:
get_googleKG_types_for_entity("Berlin", numberOfElements=1)

In [None]:
#### Apply LFs
import warnings
warnings.filterwarnings('ignore')
from snorkel.labeling import PandasLFApplier
from snorkel.labeling.apply.dask import PandasParallelLFApplier

#lfs = [cluster_n_classify]
lfs = [cluster_n_classify, googleKG, dbpedia_lookup, header_emebedding_similarity]

applier = PandasLFApplier(lfs=lfs)
L_train = applier.apply(df=unlabeled_data_df)

In [None]:
## Analys the LFs
from snorkel.labeling import LFAnalysis

LFAnalysis(L=L_train, lfs=lfs).lf_summary()

In [None]:
from snorkel.labeling.model import MajorityLabelVoter

majority_model = MajorityLabelVoter(cardinality=len(valid_types))
preds_train = majority_model.predict(L=L_train)

from sklearn.metrics import classification_report

print(f"Length of labeled data: {len([x for x in preds_train if x != -1])}")

unlabeled_data_df["L_train"] = preds_train
class_reportable_data = unlabeled_data_df.drop(unlabeled_data_df[unlabeled_data_df["L_train"] == -1].index)

 
print(classification_report(
    label_enc.transform(class_reportable_data["semanticType"]), class_reportable_data["L_train"]))

In [None]:
from snorkel.labeling.model import LabelModel

label_model = LabelModel(cardinality=len(valid_types), verbose=True)
label_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=123)

predicted_L_train = label_model.predict(L_train)

In [None]:
from sklearn.metrics import classification_report

print(f"Length of labeled data: {len([x for x in predicted_L_train if x != -1])}")

unlabeled_data_df["L_train"] = predicted_L_train
class_reportable_data = unlabeled_data_df.drop(unlabeled_data_df[unlabeled_data_df["L_train"] == -1].index)

 
print(classification_report(
    label_enc.transform(class_reportable_data["semanticType"]), class_reportable_data["L_train"]))

In [None]:
#### Apply LFs
import warnings
warnings.filterwarnings('ignore')
from snorkel.labeling import PandasLFApplier
from snorkel.labeling.apply.dask import PandasParallelLFApplier

#lfs = [cluster_n_classify]
lfs = [dbpedia_lookup]

applier = PandasLFApplier(lfs=lfs)

from multiprocessing import  Pool
from multiprocessing.pool import ThreadPool as Pool
from functools import partial
import numpy as np
from tqdm.auto import tqdm

def parallelize(data, func, num_of_processes=8):
    data_split = np.array_split(data, num_of_processes)
    pool = Pool(num_of_processes)
    #data = pd.concat(pool.map(func, data_split))
    data = np.concatenate(pool.map(func, data_split), axis=0)
    pool.close()
    pool.join()
    return data

def run_on_subset(func, data_subset):
    return data_subset.apply(func, axis=1)

def parallelize_on_rows(data, func, num_of_processes=8):
    return parallelize(data, partial(run_on_subset, func), num_of_processes)

L_train = parallelize(unlabeled_data_df,applier.apply, 6)

In [None]:
len([x for x in L_train if x == -1])

In [None]:
from sklearn.metrics import classification_report

print(classification_report(
    label_enc.transform(unlabeled_data_df["semanticType"]), L_train))


In [None]:
len(L_train)

In [None]:
## Analys the LFs
from snorkel.labeling import LFAnalysis

LFAnalysis(L=L_train, lfs=lfs).lf_summary()

## LF: WE similartity Column-Header <-> sem. Types

In [None]:
### LF: DBpedia Lookup
from snorkel.labeling import labeling_function
from snorkel.preprocess import preprocessor
from knowledge_graph.dbpedia import get_dbpedia_types_for_series
import copy
from sklearn.metrics.pairwise import cosine_similarity

### table colum loader of raw data
def load_tablecolumn_header(dataset_id:str):
    table_id = dataset_id.split("+")[0]
    column_id = dataset_id.split("+")[1].split("_")[1]
    header = pd.read_csv(join("D:\\TURL","tables_with_headers",table_id), usecols=[int(column_id)]).columns[0]
    return header

df_valid_type_WE = pd.DataFrame({"valid_types": valid_types})
#df_valid_type_WE["valid_types_WE"] = df_valid_type_WE.apply(lambda row: embed([" ".join(row["valid_types"].split("."))]), axis=1)
df_valid_type_WE["valid_types_WE"] = df_valid_type_WE.apply(lambda row: embed([row["valid_types"].split(".")[1]]), axis=1)



@labeling_function()
def header_emebedding_similarity(x):
    header = load_tablecolumn_header(x["dataset_id"])
    df_compare = copy.copy(df_valid_type_WE)
    header_embedded = embed([header])
    df_compare["cosine_sim"] = df_compare.apply(lambda row: cosine_similarity(row["valid_types_WE"], header_embedded)[0][0], axis=1)
    df_compare = df_compare.sort_values(by="cosine_sim", ascending=False)
    if df_compare.iloc[0]["cosine_sim"] >= 0.9:
        LABEL = label_enc.transform([df_compare.iloc[0]["valid_types"]])[0]
        return LABEL
    else:
        return ABSTAIN
    

In [None]:
#### Apply LFs
import warnings
warnings.filterwarnings('ignore')
from snorkel.labeling import PandasLFApplier
from snorkel.labeling.apply.dask import PandasParallelLFApplier

#lfs = [cluster_n_classify]
lfs = [header_emebedding_similarity]

applier = PandasLFApplier(lfs=lfs)
L_train = applier.apply(df=unlabeled_data_df)

In [None]:
from sklearn.metrics import classification_report

print(f"Length of labeled data: {len([x for x in L_train if x != -1])}")

unlabeled_data_df["L_train"] = L_train
class_reportable_data = unlabeled_data_df.drop(unlabeled_data_df[unlabeled_data_df["L_train"] == -1].index)

 
print(classification_report(
    label_enc.transform(class_reportable_data["semanticType"]), class_reportable_data["L_train"]))

# LF: Different Domain Expert LFS

In [None]:
list_of_nfl_teams = [
    "atlanta falcons", "baltimore ravens", "buffalo bills",
    "carolina panthers", "chicago bears", "cincinnati bengals",
    "cleveland browns", "dallas cowboys", "denver broncos", "detroit lions",
    "green bay packers", "houston texans", "indianapolis colts",
    "jacksonville jaguars", "kansas city chiefs", "las vegas raiders",
    "los angeles chargers", "los angeles rams", "miami dolphins",
    "minnesota vikings", "new england patriots", "new orleans saints",
    "new york giants", "new york jets", "philadelphia eagles",
    "pittsburgh steelers", "san francisco 49ers", "seattle seahawks",
    "tampa bay buccaneers", "tennessee titans", "washington football team"
]

list_of_nfl_teams_short = [
    'falcons', 'ravens', 'bills', 'panthers', 'bears', 'bengals', 'browns',
    'cowboys', 'broncos', 'lions', 'packers', 'texans', 'colts', 'jaguars',
    'chiefs', 'raiders', 'chargers', 'rams', 'dolphins', 'vikings', 'patriots',
    'saints', 'giants', 'jets', 'eagles', 'steelers', '49ers', 'seahawks',
    'buccaneers', 'titans'
]

In [None]:
from snorkel.labeling import labeling_function

### table colum loader of raw data
def load_tablecolumn(dataset_id:str):
    table_id = dataset_id.split("+")[0]
    column_id = dataset_id.split("+")[1].split("_")[1]
    df_column = pd.read_csv(join(os.environ["TURL_DIR"],table_id), usecols=[int(column_id)])
    return df_column.iloc[:,0].values.tolist()

@labeling_function()
def american_football_team(x):
    current_col = load_tablecolumn(x["dataset_id"])
    min_num = int(0.2 * len(current_col))
    if min_num <= 1: min_num = 2
    num_of_nfl_teams_in_col = len([x for x in load_tablecolumn(x["dataset_id"]) if str(x).lower() in list_of_nfl_teams])
    if num_of_nfl_teams_in_col >= min_num:
        LABEL = label_enc.transform(["american_football.football_team"])[0]
        return LABEL
    return ABSTAIN

In [None]:
unlabeled_data_df_with_type =  unlabeled_data_df[unlabeled_data_df["semanticType"] == "american_football.football_team"]
unlabeled_data_df_sampled = unlabeled_data_df.sample(n=100)

In [None]:
#### Apply LFs
import warnings
warnings.filterwarnings('ignore')
from snorkel.labeling import PandasLFApplier
from snorkel.labeling.apply.dask import PandasParallelLFApplier

lfs = [american_football_team, baseball_team]

applier = PandasLFApplier(lfs=lfs)
L_train_1 = applier.apply(df=unlabeled_data_df_sampled)

In [None]:
import numpy as np

l1 = L_train.transpose()[0].tolist()
l2 = L_train.transpose()[1].tolist()
new_L_train = []
new_L_train.append(l1)
new_L_train.append(l2)

new_L_train = np.array(new_L_train).transpose()

In [None]:
print(L_train.shape)
print(new_L_train.shape)

In [None]:
len([x for x in L_train if x == label_enc.transform(["american_football.football_team"])[0]])

In [None]:
from sklearn.metrics import classification_report

print(f"Length of labeled data: {len([x for x in L_train if x != -1])}")

unlabeled_data_df_sampled["L_train"] = L_train
class_reportable_data = unlabeled_data_df_sampled.drop(unlabeled_data_df_sampled[unlabeled_data_df_sampled["L_train"] == -1].index)

 
print(classification_report(
    class_reportable_data["semanticType"], label_enc.inverse_transform(class_reportable_data["L_train"])))

In [None]:
unlabeled_data_df_sampled[(unlabeled_data_df_sampled["semanticType"] == "time.event") & (unlabeled_data_df_sampled["L_train"] != -1)].head(50)

In [None]:
# baseball.baseball_teams
mlb_baseball_teams = [
    'atlanta braves', 'miami marlins', 'new york mets',
    'philadelphia phillies', 'washington nationals', 'chicago cubs',
    'cincinnati reds', 'milwaukee brewers', 'pittsburgh pirates',
    'st. louis cardinals', 'arizona diamondbacks', 'colorado rockies',
    'los angeles dodgers', 'san diego padres', 'san francisco giants',
    'baltimore orioles', 'boston red sox', 'new york yankees',
    'tampa bay rays', 'toronto blue jays', 'chicago white sox',
    'cleveland indians', 'detroit tigers', 'kansas city royals',
    'minnesota twins', 'houston astros', 'los angeles angels',
    'oakland athletics', 'seattle mariners', 'texas rangers'
]

from snorkel.labeling import labeling_function

### table colum loader of raw data
def load_tablecolumn(dataset_id:str):
    table_id = dataset_id.split("+")[0]
    column_id = dataset_id.split("+")[1].split("_")[1]
    df_column = pd.read_csv(join(os.environ["TURL_DIR"],table_id), usecols=[int(column_id)])
    return df_column.iloc[:,0].values.tolist()

@labeling_function()
def baseball_team(x):
    current_col = load_tablecolumn(x["dataset_id"])
    min_num = int(0.2 * len(current_col))
    if min_num <= 1: min_num = 2
    num_of_nfl_teams_in_col = len([x for x in load_tablecolumn(x["dataset_id"]) if str(x).lower() in mlb_baseball_teams])
    if num_of_nfl_teams_in_col >= min_num:
        LABEL = label_enc.transform(["baseball.baseball_team"])[0]
        return LABEL
    return ABSTAIN

In [None]:
unlabeled_data_df_with_type =  unlabeled_data_df[unlabeled_data_df["semanticType"] == "baseball.baseball_team"]
print(len(unlabeled_data_df_with_type))
#unlabeled_data_df_sampled = unlabeled_data_df.sample(n=100000)

In [None]:
#### Apply LFs
import warnings
warnings.filterwarnings('ignore')
from snorkel.labeling import PandasLFApplier
from snorkel.labeling.apply.dask import PandasParallelLFApplier

lfs = [baseball_team]

applier = PandasLFApplier(lfs=lfs)
L_train = applier.apply(df=unlabeled_data_df_sampled)

In [None]:
len([x for x in L_train if x == label_enc.transform(["baseball.baseball_team"])[0]])

In [None]:
from sklearn.metrics import classification_report

print(f"Length of labeled data: {len([x for x in L_train if x != -1])}")

unlabeled_data_df_sampled["L_train"] = L_train
class_reportable_data = unlabeled_data_df_sampled.drop(unlabeled_data_df_sampled[unlabeled_data_df_sampled["L_train"] == -1].index)

 
print(classification_report(
    class_reportable_data["semanticType"], label_enc.inverse_transform(class_reportable_data["L_train"])))

In [None]:
# film.film_genre

list_of_film_genre = [
    "drama", "science fiction", "sci-fi", "romance", "action", "war", "horror",
    "crime", "comedy", "comedy drama", "family film", "biography", "documentary film", "thriller"
]

from snorkel.labeling import labeling_function

### table colum loader of raw data
def load_tablecolumn(dataset_id:str):
    table_id = dataset_id.split("+")[0]
    column_id = dataset_id.split("+")[1].split("_")[1]
    df_column = pd.read_csv(join(os.environ["TURL_DIR"],table_id), usecols=[int(column_id)])
    return df_column.iloc[:,0].values.tolist()

@labeling_function()
def film_genre(x):
    current_col = load_tablecolumn(x["dataset_id"])
    min_num = int(0.2 * len(current_col))
    if min_num <= 1: min_num = 2
    num_of_nfl_teams_in_col = len([x for x in load_tablecolumn(x["dataset_id"]) if str(x).lower() in list_of_film_genre])
    if num_of_nfl_teams_in_col >= min_num:
        LABEL = label_enc.transform(["film.film_genre"])[0]
        return LABEL
    return ABSTAIN

In [None]:
unlabeled_data_df_with_type =  unlabeled_data_df[unlabeled_data_df["semanticType"] == "film.film_genre"]
print(len(unlabeled_data_df_with_type))

In [None]:
#### Apply LFs
import warnings
warnings.filterwarnings('ignore')
from snorkel.labeling import PandasLFApplier
from snorkel.labeling.apply.dask import PandasParallelLFApplier

lfs = [film_genre]

applier = PandasLFApplier(lfs=lfs)
L_train = applier.apply(df=unlabeled_data_df_sampled)

In [None]:
print(len([x for x in L_train if x == label_enc.transform(["film.film_genre"])[0]]))
print(len([x for x in L_train if x == -1]))

In [None]:
from sklearn.metrics import classification_report

print(f"Length of labeled data: {len([x for x in L_train if x != -1])}")

unlabeled_data_df_sampled["L_train"] = L_train
class_reportable_data = unlabeled_data_df_sampled.drop(unlabeled_data_df_sampled[unlabeled_data_df_sampled["L_train"] == -1].index)

 
print(classification_report(
    class_reportable_data["semanticType"], label_enc.inverse_transform(class_reportable_data["L_train"])))

In [None]:
# ice_hockey_teams
ice_hockes_teams = [
    'anaheim ducks', 'arizona coyotes', 'boston bruins', 'buffalo sabres',
    'calgary flames', 'carolina hurricanes', 'chicago blackhawks',
    'colorado avalanche', 'columbus blue jackets', 'dallas stars',
    'detroit red wings', 'edmonton oilers', 'florida panthers',
    'los angeles kings', 'minnesota wild', 'montreal canadiens',
    'nashville predators', 'new jersey devils', 'new york islanders',
    'new york rangers', 'ottawa senators', 'philadelphia flyers',
    'pittsburgh penguins', 'san jose sharks', 'seattle kraken',
    'st louis blues', 'tampa bay lightning', 'toronto maple leafs',
    'vancouver canucks', 'vegas golden knights', 'washington capitals',
    'winnipeg jets'
]


In [None]:
# soccer.football_team
soccer_football_teams = [
    'werder bremen', 'bayer 04 leverkusen', 'hamburger sv', 'fc schalke 04',
    'hertha bsc berlin', 'borussia dortmund', 'vfl wolfsburg', 'hannover 96',
    'vfb stuttgart', 'borussia m`gladbach', 'fc augsburg',
    'spvgg greuther fürth', 'tsg hoffenheim', 'sc paderborn 07', '1. fc köln',
    'eintracht frankfurt', 'sc freiburg', 'fc bayern münchen',
    'tsv alemannia aachen', '1.fsv mainz 05', '1.fc kaiserslautern',
    'vfl bochum', 'fc energie cottbus', 'dsc arminia bielefeld',
    'tsv 1860 münchen', 'kickers offenbach', 'fc erzgebirge aue',
    'tus koblenz', '1. fc nürnberg', 'karlsruher sc', 'fc hansa rostock',
    'msv duisburg', 'fc carl zeiss jena', 'sv wehen', 'fc st. pauli',
    'vfl osnabrück', 'accrington stanley', 'afc telford united',
    'aldershot town', 'fc altrincham', 'aston villa', 'fc barnsley',
    'afc barrow', 'bath city', 'birmingham city', 'birmingham city lfc',
    'blackburn rovers', 'fc blackpool', 'bolton wanderers', 'fc boreham wood',
    'boston united', 'afc bournemouth', 'bradford city',
    'bradford park avenue', 'braintree town', 'brighton & hove albion',
    'bristol city', 'bristol city women’s football club', 'bristol rovers',
    'fc burnley', 'burton albion', 'fc bury', 'cambridge united',
    'carlisle united', 'cheltenham town', 'fc chesterfield',
    'colchester united', 'coventry city', 'crawley town', 'crewe alexandra',
    'fc darwen', 'derby county', 'doncaster rovers', 'eastbourne borough',
    'ebbsfleet unite', 'fc everton', 'fc everton (frauenfußball)',
    'exeter city', 'fc chester', 'fc chorley', 'fc halifax town',
    'fleetwood town', 'forest green rovers', 'afc fylde',
    'gainsborough trinity', 'fc gillingham', 'glossop north end',
    'grimsby town', 'fc hallam', 'harrogate town', 'hartlepool united',
    'hednesford town', 'hereford united', 'fc histon', 'hucknall town',
    'huddersfield town', 'hull city', 'ipswich town', 'kettering town',
    'kidderminster harriers', 'leeds united', 'leicester city',
    'lincoln city (fußballverein)', 'fc liverpool', 'luton town',
    'maidstone united', 'manchester city', 'manchester united',
    'manchester united w.f.c.', 'mansfield town', 'fc middlesbrough',
    'milton keynes dons', 'fc morecambe', 'newcastle united',
    'northampton town', 'northwich victoria', 'norwich city',
    'nottingham forest', 'notts county', 'old carthusians f.c.',
    'old etonians', 'oldham athletic', 'ossett united', 'oswestry town',
    'oxford united', 'peterborough united', 'plymouth argyle', 'port vale',
    'fc portsmouth', 'potters bar town', 'preston north end', 'fc reading',
    'afc rochdale', 'rotherham united', 'royal engineers afc', 'salford city',
    'scunthorpe united', 'sheffield united', 'sheffield wednesday',
    'fc sheffield', 'shrewsbury town', 'solihull moors', 'fc southampton',
    'southend united', 'fc southport', 'st helens town afc',
    'stafford rangers', 'stalybridge celtic', 'fc stevenage',
    'stockport county', 'stoke city', 'afc sunderland', 'swindon town',
    'fc tamworth', 'telford united', 'torquay united', 'tranmere rovers',
    'fc united of manchester', 'fc walsall', 'fc watford',
    'west bromwich albion', 'wigan athletic', 'fc woking',
    'wolverhampton wanderers', 'worcester city', 'wycombe wanderers',
    'yeovil town', 'york city', 'real madrid', 'fc barcelona',
    'atlético madrid', 'athletic bilbao', 'fc valencia', 'fc sevilla',
    'espanyol barcelona', 'real sociedad san sebastián', 'real saragossa',
    'betis sevilla', 'celta vigo', 'deportivo la coruña', 'real valladolid',
    'sporting gijón', 'racing santander', 'ca osasuna',
    'cd málaga und fc málaga', 'real oviedo', 'ud las palmas', 'rcd mallorca',
    'fc villarreal', 'fc granada', 'fc elche', 'rayo vallecano',
    'hércules alicante'
]


In [None]:
# music genre
music_genres = [
    "disco", "hip hop", "techno", "drum'n'bass", "house", "trance", "soul",
    "r&b", "ballad", "reggae", "jazz", "jazz electro", "heavy metal",
    "hard rock", "country", "dance", "progressive rock", "pop", "pop rock", "blues", "blues rock"
]

In [None]:
# automotive.model
automotive_models = [
    'porsche carrera', 'chevrolet corvette', 'chevrolet corvette c6.r',
    'ford au falcon', 'ford ba falcon', 'vx commodore',
    'holden vz commodoreford mustang', 'chevrolet camaro', 'jaguar xkr',
    'nissan gt-r', 'nissan gt-r gt3', 'toyota celica', 'lamborghini gallardo',
    'seat león', 'mitsubishi lancer evo ix', 'škoda fabia s2000',
    'ford fiesta', 'volkswagen polo', 'opel corsa', 'renault clio',
    'peugeot 207', 'toyota yaris', 'citroen c3', 'škoda fabia', 'fiat punto',
    'seat ibiza', 'audi a1', 'hyundai i20'
]


In [None]:
# location_us_county

us_counties = [
    "jasper", "newton", "allegheny", "northampton", "huntingdon", "dauphin",
    "carbon", "wyoming", "berks", "wise", "parker", "todd", "minnehaha",
    "bexar", "gillespie", "oldham", "washoe", "windsor", "windham",
    "chittenden", "addison", "lancaster", "chester", "madison", "hinds"
]

us_counties = [
    "Abbeville", "Acadia Parish", "Accomack", "Ada", "Adair", "Adair", "Adair",
    "Adair", "Adams", "Adams", "Adams", "Adams", "Adams", "Adams", "Adams",
    "Adams", "Adams", "Adams", "Adams", "Adams", "Addison", "Adjuntas",
    "Aguada", "Aguadilla", "Aguas Buenas", "Aibonito", "Aiken", "Aitkin",
    "Alachua", "Alamance", "Alameda", "Alamosa", "Albany", "Albany",
    "Albemarle", "Alcona", "Alcorn", "Aleutians East Borough",
    "Aleutians West Census Area", "Alexander", "Alexander", "Alexandria",
    "Alfalfa", "Alger", "Allamakee", "Allegan", "Allegany", "Allegany",
    "Alleghany", "Alleghany", "Allegheny", "Allen", "Allen", "Allen", "Allen",
    "Allen Parish", "Allendale", "Alpena", "Alpine", "Amador", "Amelia",
    "Amherst", "Amite", "Anchorage", "Anchorage Municipality", "Anderson",
    "Anderson", "Anderson", "Anderson", "Anderson", "Andrew", "Andrews",
    "Androscoggin", "Angelina", "Anne Arundel", "Anoka", "Anson", "Antelope",
    "Antrim", "Apache", "Appanoose", "Appling", "Appomattox", "Aransas",
    "Arapahoe", "Archer", "Archuleta", "Arecibo", "Arenac", "Arkansas",
    "Arlington", "Armstrong", "Armstrong", "Aroostook", "Arroyo", "Arthur",
    "Ascension Parish", "Ashe", "Ashland", "Ashland", "Ashley", "Ashtabula",
    "Asotin", "Assumption Parish", "Atascosa", "Atchison", "Atchison",
    "Athens", "Atkinson", "Atlantic", "Atoka", "Attala", "Audrain", "Audubon",
    "Auglaize", "Augusta", "Aurora", "Austin", "Autauga", "Avery",
    "Avoyelles Parish", "Añasco", "Baca", "Bacon", "Bailey", "Baker", "Baker",
    "Baker", "Baker Island", "Baldwin", "Baldwin", "Ballard", "Baltimore",
    "Baltimore", "Bamberg", "Bandera", "Banks", "Banner", "Bannock", "Baraga",
    "Barber", "Barbour", "Barbour", "Barceloneta", "Barnes", "Barnstable",
    "Barnwell", "Barranquitas", "Barren", "Barron", "Barrow", "Barry", "Barry",
    "Bartholomew", "Barton", "Barton", "Bartow", "Bastrop", "Bates", "Bath",
    "Bath", "Baxter", "Bay", "Bay", "Bayamón", "Bayfield", "Baylor", "Beadle",
    "Bear Lake", "Beaufort", "Beaufort", "Beauregard Parish", "Beaver",
    "Beaver", "Beaver", "Beaverhead", "Becker", "Beckham", "Bedford",
    "Bedford", "Bedford", "Bee", "Belknap", "Bell", "Bell", "Belmont",
    "Beltrami", "Ben Hill", "Benewah", "Bennett", "Bennington", "Benson",
    "Bent", "Benton", "Benton", "Benton", "Benton", "Benton", "Benton",
    "Benton", "Benton", "Benton", "Benzie", "Bergen", "Berkeley", "Berkeley",
    "Berks", "Berkshire", "Bernalillo", "Berrien", "Berrien", "Bertie",
    "Bethel Census Area", "Bexar", "Bibb", "Bibb", "Bienville Parish",
    "Big Horn", "Big Horn", "Big Stone", "Billings", "Bingham", "Black Hawk",
    "Blackford", "Bladen", "Blaine", "Blaine", "Blaine", "Blaine", "Blair",
    "Blanco", "Bland", "Bleckley", "Bledsoe", "Blount", "Blount", "Blue Earth",
    "Boise", "Bolivar", "Bollinger", "Bon Homme", "Bond", "Bonner",
    "Bonneville", "Boone", "Boone", "Boone", "Boone", "Boone", "Boone",
    "Boone", "Boone", "Borden", "Bosque", "Bossier Parish", "Botetourt",
    "Bottineau", "Boulder", "Boundary", "Bourbon", "Bourbon", "Bowie",
    "Bowman", "Box Butte", "Box Elder", "Boyd", "Boyd", "Boyle", "Bracken",
    "Bradford", "Bradford", "Bradley", "Bradley", "Branch", "Brantley",
    "Braxton", "Brazoria", "Brazos", "Breathitt", "Breckinridge", "Bremer",
    "Brevard", "Brewster", "Briscoe", "Bristol", "Bristol Bay Borough",
    "Bristol", "Bristol", "Broadwater", "Brooke", "Brookings", "Brooklyn",
    "Brooks", "Brooks", "Broome", "Broomfield", "Broward", "Brown", "Brown",
    "Brown", "Brown", "Brown", "Brown", "Brown", "Brown", "Brown", "Brule",
    "Brunswick", "Brunswick", "Bryan", "Bryan", "Buchanan", "Buchanan",
    "Buchanan", "Buckingham", "Bucks", "Buena Vista", "Buena Vista", "Buffalo",
    "Buffalo", "Buffalo", "Bullitt", "Bulloch", "Bullock", "Buncombe",
    "Bureau", "Burke", "Burke", "Burke", "Burleigh", "Burleson", "Burlington",
    "Burnet", "Burnett", "Burt", "Butler", "Butler", "Butler", "Butler",
    "Butler", "Butler", "Butler", "Butler", "Butte", "Butte", "Butte", "Butts",
    "Cabarrus", "Cabell", "Cabo Rojo", "Cache", "Caddo", "Caddo Parish",
    "Caguas", "Calaveras", "Calcasieu Parish", "Caldwell", "Caldwell",
    "Caldwell", "Caldwell", "Caldwell Parish", "Caledonia", "Calhoun",
    "Calhoun", "Calhoun", "Calhoun", "Calhoun", "Calhoun", "Calhoun",
    "Calhoun", "Calhoun", "Calhoun", "Calhoun", "Callahan", "Callaway",
    "Calloway", "Calumet", "Calvert", "Camas", "Cambria", "Camden", "Camden",
    "Camden", "Camden", "Cameron", "Cameron", "Cameron Parish", "Camp",
    "Campbell", "Campbell", "Campbell", "Campbell", "Campbell", "Camuy",
    "Canadian", "Candler", "Cannon", "Canyon", "Canóvanas", "Cape Girardeau",
    "Cape May", "Carbon", "Carbon", "Carbon", "Carbon", "Caribou", "Carlisle",
    "Carlton", "Carolina", "Caroline", "Caroline", "Carroll", "Carroll",
    "Carroll", "Carroll", "Carroll", "Carroll", "Carroll", "Carroll",
    "Carroll", "Carroll", "Carroll", "Carroll", "Carroll", "Carson City",
    "Carson", "Carter", "Carter", "Carter", "Carter", "Carter", "Carteret",
    "Carver", "Cascade", "Casey", "Cass", "Cass", "Cass", "Cass", "Cass",
    "Cass", "Cass", "Cass", "Cass", "Cassia", "Castro", "Caswell",
    "Catahoula Parish", "Catawba", "Cataño", "Catoosa", "Catron",
    "Cattaraugus", "Cavalier", "Cayey", "Cayuga", "Cecil", "Cedar", "Cedar",
    "Cedar", "Ceiba", "Centre", "Cerro Gordo", "Chaffee", "Chambers",
    "Chambers", "Champaign", "Champaign", "Chariton", "Charles City",
    "Charles", "Charles Mix", "Charleston", "Charlevoix", "Charlotte",
    "Charlotte", "Charlottesville", "Charlton", "Chase", "Chase", "Chatham",
    "Chatham", "Chattahoochee", "Chattooga", "Chautauqua", "Chautauqua",
    "Chaves", "Cheatham", "Cheboygan", "Chelan", "Chemung", "Chenango",
    "Cherokee", "Cherokee", "Cherokee", "Cherokee", "Cherokee", "Cherokee",
    "Cherokee", "Cherokee", "Cherry", "Chesapeake", "Cheshire", "Chester",
    "Chester", "Chester", "Chesterfield", "Chesterfield", "Cheyenne",
    "Cheyenne", "Cheyenne", "Chickasaw", "Chickasaw", "Chicot", "Childress",
    "Chilton", "Chippewa", "Chippewa", "Chippewa", "Chisago", "Chittenden",
    "Choctaw", "Choctaw", "Choctaw", "Chouteau", "Chowan", "Christian",
    "Christian", "Christian", "Chugach Census Area", "Churchill", "Ciales",
    "Cibola", "Cidra", "Cimarron", "Citrus", "Clackamas", "Claiborne",
    "Claiborne", "Claiborne Parish", "Clallam", "Clare", "Clarendon",
    "Clarion", "Clark", "Clark", "Clark", "Clark", "Clark", "Clark", "Clark",
    "Clark", "Clark", "Clark", "Clark", "Clark", "Clarke", "Clarke", "Clarke",
    "Clarke", "Clarke", "Clatsop", "Clay", "Clay", "Clay", "Clay", "Clay",
    "Clay", "Clay", "Clay", "Clay", "Clay", "Clay", "Clay", "Clay", "Clay",
    "Clay", "Clay", "Clay", "Clay", "Clayton", "Clayton", "Clear Creek",
    "Clearfield", "Clearwater", "Clearwater", "Cleburne", "Cleburne",
    "Clermont", "Cleveland", "Cleveland", "Cleveland", "Clinch", "Clinton",
    "Clinton", "Clinton", "Clinton", "Clinton", "Clinton", "Clinton",
    "Clinton", "Clinton", "Cloud", "Coahoma", "Coal", "Coamo", "Cobb",
    "Cochise", "Cochran", "Cocke", "Coconino", "Codington", "Coffee", "Coffee",
    "Coffee", "Coffey", "Coke", "Colbert", "Cole", "Coleman", "Coles",
    "Colfax", "Colfax", "Colleton", "Collier", "Collin", "Collingsworth",
    "Colonial Heights", "Colorado", "Colquitt", "Columbia", "Columbia",
    "Columbia", "Columbia", "Columbia", "Columbia", "Columbia", "Columbia",
    "Columbiana", "Columbus", "Colusa", "Comal", "Comanche", "Comanche",
    "Comanche", "Comerío", "Concho", "Concordia Parish", "Conecuh", "Conejos",
    "Contra Costa", "Converse", "Conway", "Cook", "Cook", "Cook", "Cooke",
    "Cooper", "Coos", "Coosa", "Copiah", "Copper River Census Area", "Corozal",
    "Corson", "Cortland", "Coryell", "Coshocton", "Costilla", "Cottle",
    "Cotton", "Cottonwood", "Covington", "Covington", "Covington", "Coweta",
    "Cowley", "Cowlitz", "Coös", "Craig", "Craig", "Craighead", "Crane",
    "Craven", "Crawford", "Crawford", "Crawford", "Crawford", "Crawford",
    "Crawford", "Crawford", "Crawford", "Crawford", "Crawford", "Crawford",
    "Creek", "Crenshaw", "Crisp", "Crittenden", "Crittenden", "Crockett",
    "Crockett", "Crook", "Crook", "Crosby", "Cross", "Crow Wing", "Crowley",
    "Culberson", "Culebra", "Cullman", "Culpeper", "Cumberland", "Cumberland",
    "Cumberland", "Cumberland", "Cumberland", "Cumberland", "Cumberland",
    "Cumberland", "Cuming", "Currituck", "Curry", "Curry", "Custer", "Custer",
    "Custer", "Custer", "Custer", "Custer", "Cuyahoga", "Dade", "Dade",
    "Daggett", "Dakota", "Dakota", "Dale", "Dallam", "Dallas", "Dallas",
    "Dallas", "Dallas", "Dallas", "Dane", "Daniels", "Danville", "Dare",
    "Darke", "Darlington", "Dauphin", "Davidson", "Davidson", "Davie",
    "Daviess", "Daviess", "Daviess", "Davis", "Davis", "Davison", "Dawes",
    "Dawson", "Dawson", "Dawson", "Dawson", "Day", "De Baca", "Deaf Smith",
    "Dearborn", "Decatur", "Decatur", "Decatur", "Decatur", "Decatur",
    "Deer Lodge", "Defiance", "DeKalb", "DeKalb", "DeKalb", "DeKalb", "DeKalb",
    "DeKalb", "Del Norte", "Delaware", "Delaware", "Delaware", "Delaware",
    "Delaware", "Delaware", "Delta", "Delta", "Delta", "Denali Borough",
    "Dent", "Denton", "Denver", "Des Moines", "Deschutes", "Desha", "DeSoto",
    "DeSoto", "DeSoto Parish", "Deuel", "Deuel", "Dewey", "Dewey", "DeWitt",
    "DeWitt", "Dickens", "Dickenson", "Dickey", "Dickinson", "Dickinson",
    "Dickinson", "Dickson", "Dillingham Census Area", "Dillon", "Dimmit",
    "Dinwiddie", "Divide", "Dixie", "Dixon", "Doddridge", "Dodge", "Dodge",
    "Dodge", "Dodge", "Dolores", "Doniphan", "Donley", "Dooly", "Door",
    "Dorado", "Dorchester", "Dorchester", "Dougherty", "Douglas", "Douglas",
    "Douglas", "Douglas", "Douglas", "Douglas", "Douglas", "Douglas",
    "Douglas", "Douglas", "Douglas", "Douglas", "Doña Ana", "Drew", "Dubois",
    "Dubuque", "Duchesne", "Dukes", "Dundy", "Dunklin", "Dunn", "Dunn",
    "DuPage", "Duplin", "Durham", "Dutchess", "Duval", "Duval", "Dyer",
    "Eagle", "Early", "East Baton Rouge Parish", "East Carroll Parish",
    "East Feliciana Parish", "Eastern District", "Eastland", "Eaton",
    "Eau Claire", "Echols", "Ector", "Eddy", "Eddy", "Edgar", "Edgecombe",
    "Edgefield", "Edmonson", "Edmunds", "Edwards", "Edwards", "Edwards",
    "Effingham", "Effingham", "El Dorado", "El Paso", "El Paso", "Elbert",
    "Elbert", "Elk", "Elk", "Elkhart", "Elko", "Elliott", "Ellis", "Ellis",
    "Ellis", "Ellsworth", "Elmore", "Elmore", "Emanuel", "Emery", "Emmet",
    "Emmet", "Emmons", "Emporia", "entity", "Erath", "Erie", "Erie", "Erie",
    "Escambia", "Escambia", "Esmeralda", "Essex", "Essex", "Essex", "Essex",
    "Essex", "Estill", "Etowah", "Eureka", "Evangeline Parish", "Evans",
    "Fairbanks North Star Borough", "Fairfax", "Fairfax", "Fairfield",
    "Fairfield", "Fairfield", "Fajardo", "Fall River", "Fallon",
    "Falls Church", "Falls", "Fannin", "Fannin", "Faribault", "Faulk",
    "Faulkner", "Fauquier", "Fayette", "Fayette", "Fayette", "Fayette",
    "Fayette", "Fayette", "Fayette", "Fayette", "Fayette", "Fayette",
    "Fayette", "Fentress", "Fergus", "Ferry", "Fillmore", "Fillmore", "Finney",
    "Fisher", "Flagler", "Flathead", "Fleming", "Florence", "Florence",
    "Florida", "Floyd", "Floyd", "Floyd", "Floyd", "Floyd", "Floyd",
    "Fluvanna", "Foard", "Fond du Lac", "Ford", "Ford", "Forest", "Forest",
    "Forrest", "Forsyth", "Forsyth", "Fort Bend", "Foster", "Fountain",
    "Franklin", "Franklin", "Franklin", "Franklin", "Franklin", "Franklin",
    "Franklin", "Franklin", "Franklin", "Franklin", "Franklin", "Franklin",
    "Franklin", "Franklin", "Franklin", "Franklin", "Franklin", "Franklin",
    "Franklin", "Franklin", "Franklin", "Franklin", "Franklin", "Franklin",
    "Franklin", "Franklin Parish", "Frederick", "Frederick", "Fredericksburg",
    "Freeborn", "Freestone", "Fremont", "Fremont", "Fremont", "Fremont",
    "Fresno", "Frio", "Frontier", "Fulton", "Fulton", "Fulton", "Fulton",
    "Fulton", "Fulton", "Fulton", "Fulton", "Furnas", "Gadsden", "Gage",
    "Gaines", "Galax", "Gallatin", "Gallatin", "Gallatin", "Gallia",
    "Galveston", "Garden", "Garfield", "Garfield", "Garfield", "Garfield",
    "Garfield", "Garfield", "Garland", "Garrard", "Garrett", "Garvin", "Garza",
    "Gasconade", "Gaston", "Gates", "Geary", "Geauga", "Gem", "Genesee",
    "Genesee", "Geneva", "Gentry", "George", "Georgetown", "Gibson", "Gibson",
    "Gila", "Gilchrist", "Giles", "Giles", "Gillespie", "Gilliam", "Gilmer",
    "Gilmer", "Gilpin", "Glacier", "Glades", "Gladwin", "Glascock",
    "Glasscock", "Glenn", "Gloucester", "Gloucester", "Glynn", "Gogebic",
    "Golden Valley", "Golden Valley", "Goliad", "Gonzales", "Goochland",
    "Goodhue", "Gooding", "Gordon", "Goshen", "Gosper", "Gove", "Grady",
    "Grady", "Grafton", "Graham", "Graham", "Graham", "Grainger", "Grand",
    "Grand", "Grand Forks", "Grand Isle", "Grand Traverse", "Granite", "Grant",
    "Grant", "Grant", "Grant", "Grant", "Grant", "Grant", "Grant", "Grant",
    "Grant", "Grant", "Grant", "Grant", "Grant", "Grant Parish", "Granville",
    "Gratiot", "Graves", "Gray", "Gray", "Grays Harbor", "Grayson", "Grayson",
    "Grayson", "Greeley", "Greeley", "Green", "Green", "Green Lake",
    "Greenbrier", "Greene", "Greene", "Greene", "Greene", "Greene", "Greene",
    "Greene", "Greene", "Greene", "Greene", "Greene", "Greene", "Greene",
    "Greene", "Greenlee", "Greensville", "Greenup", "Greenville", "Greenwood",
    "Greenwood", "Greer", "Gregg", "Gregory", "Grenada", "Griggs", "Grimes",
    "Grundy", "Grundy", "Grundy", "Grundy", "Guadalupe", "Guadalupe",
    "Guayama", "Guayanilla", "Guaynabo", "Guernsey", "Guilford", "Gulf",
    "Gunnison", "Gurabo", "Guthrie", "Guánica", "Gwinnett", "Haakon",
    "Habersham", "Haines Borough", "Hale", "Hale", "Halifax", "Halifax",
    "Hall", "Hall", "Hall", "Hamblen", "Hamilton", "Hamilton", "Hamilton",
    "Hamilton", "Hamilton", "Hamilton", "Hamilton", "Hamilton", "Hamilton",
    "Hamilton", "Hamlin", "Hampden", "Hampshire", "Hampshire", "Hampton",
    "Hampton", "Hancock", "Hancock", "Hancock", "Hancock", "Hancock",
    "Hancock", "Hancock", "Hancock", "Hancock", "Hancock", "Hand", "Hanover",
    "Hansford", "Hanson", "Haralson", "Hardee", "Hardeman", "Hardeman",
    "Hardin", "Hardin", "Hardin", "Hardin", "Hardin", "Hardin", "Harding",
    "Harding", "Hardy", "Harford", "Harlan", "Harlan", "Harmon", "Harnett",
    "Harney", "Harper", "Harper", "Harris", "Harris", "Harrison", "Harrison",
    "Harrison", "Harrison", "Harrison", "Harrison", "Harrison", "Harrison",
    "Harrisonburg", "Hart", "Hart", "Hartford", "Hartley", "Harvey", "Haskell",
    "Haskell", "Haskell", "Hatillo", "Hawaii", "Hawkins", "Hayes", "Hays",
    "Haywood", "Haywood", "Heard", "Hemphill", "Hempstead", "Henderson",
    "Henderson", "Henderson", "Henderson", "Henderson", "Hendricks", "Hendry",
    "Hennepin", "Henrico", "Henry", "Henry", "Henry", "Henry", "Henry",
    "Henry", "Henry", "Henry", "Henry", "Henry", "Herkimer", "Hernando",
    "Hertford", "Hettinger", "Hickman", "Hickman", "Hickory", "Hidalgo",
    "Hidalgo", "Highland", "Highland", "Highlands", "Hill", "Hill",
    "Hillsborough", "Hillsborough", "Hillsdale", "Hinds", "Hinsdale",
    "Hitchcock", "Hocking", "Hockley", "Hodgeman", "Hoke", "Holmes", "Holmes",
    "Holmes", "Holt", "Holt", "Honolulu", "Hood", "Hood River", "Hooker",
    "Hoonah–Angoon Census Area", "Hopewell", "Hopkins", "Hopkins",
    "Hormigueros", "Horry", "Hot Spring", "Hot Springs", "Houghton", "Houston",
    "Houston", "Houston", "Houston", "Houston", "Howard", "Howard", "Howard",
    "Howard", "Howard", "Howard", "Howard", "Howell", "Howland Island",
    "Hubbard", "Hudson", "Hudspeth", "Huerfano", "Hughes", "Hughes", "Humacao",
    "Humboldt", "Humboldt", "Humboldt", "Humphreys", "Humphreys", "Hunt",
    "Hunterdon", "Huntingdon", "Huntington", "Huron", "Huron", "Hutchinson",
    "Hutchinson", "Hyde", "Hyde", "Iberia Parish", "Iberville Parish", "Ida",
    "Idaho", "Imperial", "Independence", "Indian River", "Indiana", "Ingham",
    "Inyo", "Ionia", "Iosco", "Iowa", "Iowa", "Iredell", "Irion", "Iron",
    "Iron", "Iron", "Iron", "Iroquois", "Irwin", "Isabela", "Isabella",
    "Isanti", "Island", "Isle of Wight", "Issaquena", "Itasca", "Itawamba",
    "Izard", "Jack", "Jackson", "Jackson", "Jackson", "Jackson", "Jackson",
    "Jackson", "Jackson", "Jackson", "Jackson", "Jackson", "Jackson",
    "Jackson", "Jackson", "Jackson", "Jackson", "Jackson", "Jackson",
    "Jackson", "Jackson", "Jackson", "Jackson", "Jackson", "Jackson",
    "Jackson Parish", "James City", "Jarvis Island", "Jasper", "Jasper",
    "Jasper", "Jasper", "Jasper", "Jasper", "Jasper", "Jasper", "Jay",
    "Jayuya", "Jeff Davis", "Jeff Davis", "Jefferson", "Jefferson",
    "Jefferson", "Jefferson", "Jefferson", "Jefferson", "Jefferson",
    "Jefferson", "Jefferson", "Jefferson", "Jefferson", "Jefferson",
    "Jefferson", "Jefferson", "Jefferson", "Jefferson", "Jefferson",
    "Jefferson", "Jefferson", "Jefferson", "Jefferson", "Jefferson",
    "Jefferson", "Jefferson", "Jefferson", "Jefferson Davis",
    "Jefferson Davis Parish", "Jefferson Parish", "Jenkins", "Jennings",
    "Jerauld", "Jerome", "Jersey", "Jessamine", "Jewell", "Jim Hogg",
    "Jim Wells", "Jo Daviess", "Johnson", "Johnson", "Johnson", "Johnson",
    "Johnson", "Johnson", "Johnson", "Johnson", "Johnson", "Johnson", "Johnson",
    "Johnson", "Johnston Atoll", "Johnston", "Johnston", "Jones", "Jones",
    "Jones", "Jones", "Jones", "Jones", "Josephine", "Juab", "Juana Díaz",
    "Judith Basin", "Juncos", "Juneau", "Juneau", "Juniata", "Kalamazoo",
    "Kalawao", "Kalkaska", "Kanabec", "Kanawha", "Kandiyohi", "Kane", "Kane",
    "Kankakee", "Karnes", "Kauai", "Kaufman", "Kay", "Kearney", "Kearny",
    "Keith", "Kemper", "Kenai Peninsula Borough", "Kendall", "Kendall",
    "Kenedy", "Kennebec", "Kenosha", "Kent", "Kent", "Kent", "Kent", "Kent",
    "Kenton", "Keokuk", "Kern", "Kerr", "Kershaw", "Ketchikan Gateway Borough",
    "Kewaunee", "Keweenaw", "Keya Paha", "Kidder", "Kimball", "Kimble",
    "King and Queen", "King", "King", "King George", "King William",
    "Kingfisher", "Kingman", "Kingman Reef", "Kings", "Kingsbury", "Kinney",
    "Kiowa", "Kiowa", "Kiowa", "Kit Carson", "Kitsap", "Kittitas", "Kittson",
    "Klamath", "Kleberg", "Klickitat", "Knott", "Knox", "Knox", "Knox", "Knox",
    "Knox", "Knox", "Knox", "Knox", "Knox", "Kodiak Island Borough",
    "Koochiching", "Kootenai", "Kosciusko", "Kossuth", "Kusilvak Census Area",
    "La Crosse", "La Paz", "La Plata", "La Salle", "La Salle Parish",
    "Labette", "Lac qui Parle", "Lackawanna", "Laclede", "Lafayette",
    "Lafayette", "Lafayette", "Lafayette", "Lafayette", "Lafayette Parish",
    "Lafourche Parish", "LaGrange", "Lajas", "Lake and Peninsula Borough",
    "Lake", "Lake", "Lake", "Lake", "Lake", "Lake", "Lake", "Lake", "Lake",
    "Lake", "Lake", "Lake", "Lake of the Woods", "Lamar", "Lamar", "Lamar",
    "Lamar", "Lamb", "Lamoille", "LaMoure", "Lampasas", "Lancaster",
    "Lancaster", "Lancaster", "Lancaster", "Lander", "Lane", "Lane",
    "Langlade", "Lanier", "Lapeer", "LaPorte", "Laramie", "Lares", "Larimer",
    "LaRue", "Las Animas", "Las Marías", "Las Piedras", "LaSalle", "Lassen",
    "Latah", "Latimer", "Lauderdale", "Lauderdale", "Lauderdale", "Laurel",
    "Laurens", "Laurens", "Lavaca", "Lawrence", "Lawrence", "Lawrence",
    "Lawrence", "Lawrence", "Lawrence", "Lawrence", "Lawrence", "Lawrence",
    "Lawrence", "Lawrence", "Le Flore", "Le Sueur", "Lea", "Leake",
    "Leavenworth", "Lebanon", "Lee", "Lee", "Lee", "Lee", "Lee", "Lee", "Lee",
    "Lee", "Lee", "Lee", "Lee", "Lee", "Leelanau", "Leflore", "Lehigh",
    "Lemhi", "Lenawee", "Lenoir", "Leon", "Leon", "Leslie", "Letcher", "Levy",
    "Lewis and Clark", "Lewis", "Lewis", "Lewis", "Lewis", "Lewis", "Lewis",
    "Lewis", "Lexington", "Lexington", "Liberty", "Liberty", "Liberty",
    "Liberty", "Licking", "Limestone", "Limestone", "Lincoln", "Lincoln",
    "Lincoln", "Lincoln", "Lincoln", "Lincoln", "Lincoln", "Lincoln",
    "Lincoln", "Lincoln", "Lincoln", "Lincoln", "Lincoln", "Lincoln",
    "Lincoln", "Lincoln", "Lincoln", "Lincoln", "Lincoln", "Lincoln",
    "Lincoln", "Lincoln", "Lincoln", "Lincoln Parish", "Linn", "Linn", "Linn",
    "Linn", "Lipscomb", "Litchfield", "Little River", "Live Oak", "Livingston",
    "Livingston", "Livingston", "Livingston", "Livingston",
    "Livingston Parish", "Llano", "Logan", "Logan", "Logan", "Logan", "Logan",
    "Logan", "Logan", "Logan", "Logan", "Logan", "Long", "Lonoke", "Lorain",
    "Los Alamos", "Los Angeles", "Loudon", "Loudoun", "Louisa", "Louisa",
    "Loup", "Love", "Loving", "Lowndes", "Lowndes", "Lowndes", "Loíza",
    "Lubbock", "Lucas", "Lucas", "Luce", "Lumpkin", "Luna", "Lunenburg",
    "Luquillo", "Luzerne", "Lycoming", "Lyman", "Lynchburg", "Lynn", "Lyon",
    "Lyon", "Lyon", "Lyon", "Lyon", "Mackinac", "Macomb", "Macon", "Macon",
    "Macon", "Macon", "Macon", "Macon", "Macoupin", "Madera", "Madison",
    "Madison", "Madison", "Madison", "Madison", "Madison", "Madison",
    "Madison", "Madison", "Madison", "Madison", "Madison", "Madison",
    "Madison", "Madison", "Madison", "Madison", "Madison", "Madison",
    "Madison Parish", "Magoffin", "Mahaska", "Mahnomen", "Mahoning", "Major",
    "Malheur", "Manassas", "Manassas Park", "Manatee", "Manatí", "Manhattan",
    "Manistee", "Manitowoc", "Manu'a District", "Marathon", "Marengo",
    "Maricao", "Maricopa", "Maries", "Marin", "Marinette", "Marion", "Marion",
    "Marion", "Marion", "Marion", "Marion", "Marion", "Marion", "Marion",
    "Marion", "Marion", "Marion", "Marion", "Marion", "Marion", "Marion",
    "Marion", "Mariposa", "Marlboro", "Marquette", "Marquette", "Marshall",
    "Marshall", "Marshall", "Marshall", "Marshall", "Marshall", "Marshall",
    "Marshall", "Marshall", "Marshall", "Marshall", "Marshall", "Martin",
    "Martin", "Martin", "Martin", "Martin", "Martin", "Martinsville", "Mason",
    "Mason", "Mason", "Mason", "Mason", "Mason", "Massac", "Matagorda",
    "Matanuska-Susitna Borough", "Mathews", "Maui", "Maunabo", "Maury",
    "Maverick", "Mayagüez", "Mayes", "McClain", "McCone", "McCook",
    "McCormick", "McCracken", "McCreary", "McCulloch", "McCurtain", "McDonald",
    "McDonough", "McDowell", "McDowell", "McDuffie", "McHenry", "McHenry",
    "McIntosh", "McIntosh", "McIntosh", "McKean", "McKenzie", "McKinley",
    "McLean", "McLean", "McLean", "McLennan", "McLeod", "McMinn", "McMullen",
    "McNairy", "McPherson", "McPherson", "McPherson", "Meade", "Meade",
    "Meade", "Meagher", "Mecklenburg", "Mecklenburg", "Mecosta", "Medina",
    "Medina", "Meeker", "Meigs", "Meigs", "Mellette", "Menard", "Menard",
    "Mendocino", "Menifee", "Menominee", "Menominee", "Merced", "Mercer",
    "Mercer", "Mercer", "Mercer", "Mercer", "Mercer", "Mercer", "Mercer",
    "Meriwether", "Merrick", "Merrimack", "Mesa", "Metcalfe", "Miami", "Miami",
    "Miami", "Miami-Dade", "Middlesex", "Middlesex", "Middlesex", "Middlesex",
    "Midland", "Midland", "Midway Atoll", "Mifflin", "Milam", "Millard",
    "Mille Lacs", "Miller", "Miller", "Miller", "Mills", "Mills", "Milwaukee",
    "Miner", "Mineral", "Mineral", "Mineral", "Mineral", "Mingo", "Minidoka",
    "Minnehaha", "Missaukee", "Mississippi", "Mississippi", "Missoula",
    "Mitchell", "Mitchell", "Mitchell", "Mitchell", "Mitchell", "Mobile",
    "Moca", "Modoc", "Moffat", "Mohave", "Moniteau", "Monmouth", "Mono",
    "Monona", "Monongalia", "Monroe", "Monroe", "Monroe", "Monroe", "Monroe",
    "Monroe", "Monroe", "Monroe", "Monroe", "Monroe", "Monroe", "Monroe",
    "Monroe", "Monroe", "Monroe", "Monroe", "Monroe", "Montague", "Montcalm",
    "Monterey", "Montezuma", "Montgomery", "Montgomery", "Montgomery",
    "Montgomery", "Montgomery", "Montgomery", "Montgomery", "Montgomery",
    "Montgomery", "Montgomery", "Montgomery", "Montgomery", "Montgomery",
    "Montgomery", "Montgomery", "Montgomery", "Montgomery", "Montgomery",
    "Montmorency", "Montour", "Montrose", "Moody", "Moore", "Moore", "Moore",
    "Mora", "Morehouse Parish", "Morgan", "Morgan", "Morgan", "Morgan",
    "Morgan", "Morgan", "Morgan", "Morgan", "Morgan", "Morgan", "Morgan",
    "Morovis", "Morrill", "Morris", "Morris", "Morris", "Morrison", "Morrow",
    "Morrow", "Morton", "Morton", "Motley", "Moultrie", "Mountrail", "Mower",
    "Muhlenberg", "Multnomah", "Murray", "Murray", "Murray", "Muscatine",
    "Muscogee", "Muskegon", "Muskingum", "Muskogee", "Musselshell",
    "Nacogdoches", "Naguabo", "Nance", "Nantucket", "Napa", "Naranjito",
    "Nash", "Nassau", "Nassau", "Natchitoches Parish", "Natrona", "Navajo",
    "Navarro", "Navassa Island", "Nelson", "Nelson", "Nelson", "Nemaha",
    "Nemaha", "Neosho", "Neshoba", "Ness", "Nevada", "Nevada", "New Castle",
    "New Hanover", "New Haven", "New Kent", "New London", "New Madrid",
    "New York", "Newaygo", "Newberry", "Newport", "Newport News", "Newton",
    "Newton", "Newton", "Newton", "Newton", "Newton", "Nez Perce", "Niagara",
    "Nicholas", "Nicholas", "Nicollet", "Niobrara", "Noble", "Noble", "Noble",
    "Nobles", "Nodaway", "Nolan", "Nome Census Area", "Norfolk", "Norfolk",
    "Norman", "North Slope Borough", "Northampton", "Northampton",
    "Northampton", "Northern Islands Municipality", "Northumberland",
    "Northumberland", "Northwest Arctic Borough", "Norton", "Norton",
    "Nottoway", "Nowata", "Noxubee", "Nuckolls", "Nueces", "Nye", "O'Brien",
    "Oakland", "Obion", "Ocean", "Oceana", "Ochiltree", "Oconee", "Oconee",
    "Oconto", "Ogemaw", "Oglala Lakota", "Ogle", "Oglethorpe", "Ohio", "Ohio",
    "Ohio", "Okaloosa", "Okanogan", "Okeechobee", "Okfuskee", "Oklahoma",
    "Okmulgee", "Oktibbeha", "Oldham", "Oldham", "Oliver", "Olmsted", "Oneida",
    "Oneida", "Oneida", "Onondaga", "Onslow", "Ontario", "Ontonagon", "Orange",
    "Orange", "Orange", "Orange", "Orange", "Orange", "Orange", "Orange",
    "Orangeburg", "Oregon", "Orleans", "Orleans", "Orleans Parish", "Orocovis",
    "Osage", "Osage", "Osage", "Osborne", "Osceola", "Osceola", "Osceola",
    "Oscoda", "Oswego", "Otero", "Otero", "Otoe", "Otsego", "Otsego", "Ottawa",
    "Ottawa", "Ottawa", "Ottawa", "Otter Tail", "Ouachita", "Ouachita Parish",
    "Ouray", "Outagamie", "Overton", "Owen", "Owen", "Owsley", "Owyhee",
    "Oxford", "Ozark", "Ozaukee", "Pacific", "Page", "Page", "Palm Beach",
    "Palmyra Atoll", "Palo Alto", "Palo Pinto", "Pamlico", "Panola", "Panola",
    "Park", "Park", "Park", "Parke", "Parker", "Parmer", "Pasco", "Pasquotank",
    "Passaic", "Patillas", "Patrick", "Paulding", "Paulding", "Pawnee",
    "Pawnee", "Pawnee", "Payette", "Payne", "Peach", "Pearl River", "Pecos",
    "Pembina", "Pemiscot", "Pend Oreille", "Pender", "Pendleton", "Pendleton",
    "Pennington", "Pennington", "Penobscot", "Peoria", "Pepin", "Perkins",
    "Perkins", "Perquimans", "Perry", "Perry", "Perry", "Perry", "Perry",
    "Perry", "Perry", "Perry", "Perry", "Perry", "Pershing", "Person",
    "Petersburg", "Petersburg Borough", "Petroleum", "Pettis", "Peñuelas",
    "Phelps", "Phelps", "Philadelphia", "Phillips", "Phillips", "Phillips",
    "Phillips", "Piatt", "Pickaway", "Pickens", "Pickens", "Pickens",
    "Pickett", "Pierce", "Pierce", "Pierce", "Pierce", "Pierce", "Pike",
    "Pike", "Pike", "Pike", "Pike", "Pike", "Pike", "Pike", "Pike", "Pike",
    "Pima", "Pinal", "Pine", "Pinellas", "Pipestone", "Piscataquis", "Pitkin",
    "Pitt", "Pittsburg", "Pittsylvania", "Piute", "Placer",
    "Plaquemines Parish", "Platte", "Platte", "Platte", "Pleasants", "Plumas",
    "Plymouth", "Plymouth", "Pocahontas", "Pocahontas", "Poinsett",
    "Pointe Coupee Parish", "Polk", "Polk", "Polk", "Polk", "Polk", "Polk",
    "Polk", "Polk", "Polk", "Polk", "Polk", "Polk", "Ponce", "Pondera",
    "Pontotoc", "Pontotoc", "Pope", "Pope", "Pope", "Poquoson", "Portage",
    "Portage", "Porter", "Portsmouth", "Posey", "Pottawatomie", "Pottawatomie",
    "Pottawattamie", "Potter", "Potter", "Potter", "Powder River", "Powell",
    "Powell", "Power", "Poweshiek", "Powhatan", "Prairie", "Prairie", "Pratt",
    "Preble", "Prentiss", "Presidio", "Presque Isle", "Preston", "Price",
    "Prince Edward", "Prince George", "Prince George's",
    "Prince of Wales–Hyder Census Area", "Prince William", "Providence",
    "Prowers", "Pueblo", "Pulaski", "Pulaski", "Pulaski", "Pulaski", "Pulaski",
    "Pulaski", "Pulaski", "Pushmataha", "Putnam", "Putnam", "Putnam", "Putnam",
    "Putnam", "Putnam", "Putnam", "Putnam", "Putnam", "Quay", "Quebradillas",
    "Queen Anne's", "Queens", "Quitman", "Quitman", "Rabun", "Racine",
    "Radford", "Rains", "Raleigh", "Ralls", "Ramsey", "Ramsey", "Randall",
    "Randolph", "Randolph", "Randolph", "Randolph", "Randolph", "Randolph",
    "Randolph", "Randolph", "Rankin", "Ransom", "Rapides Parish",
    "Rappahannock", "Ravalli", "Rawlins", "Ray", "Reagan", "Real", "Red Lake",
    "Red River", "Red River Parish", "Red Willow", "Redwood", "Reeves",
    "Refugio", "Reno", "Rensselaer", "Renville", "Renville", "Republic",
    "Reynolds", "Rhea", "Rice", "Rice", "Rich", "Richardson", "Richland",
    "Richland", "Richland", "Richland", "Richland", "Richland",
    "Richland Parish", "Richmond", "Richmond", "Richmond", "Richmond", "Riley",
    "Rincón", "Ringgold", "Rio Arriba", "Rio Blanco", "Rio Grande", "Ripley",
    "Ripley", "Ritchie", "Riverside", "Roane", "Roane", "Roanoke", "Roanoke",
    "Roberts", "Roberts", "Robertson", "Robertson", "Robertson", "Robeson",
    "Rock", "Rock", "Rock", "Rock Island", "Rockbridge", "Rockcastle",
    "Rockdale", "Rockingham", "Rockingham", "Rockingham", "Rockland",
    "Rockwall", "Roger Mills", "Rogers", "Rolette", "Rooks", "Roosevelt",
    "Roosevelt", "Roscommon", "Rose Atoll", "Roseau", "Rosebud", "Ross",
    "Rota", "Routt", "Rowan", "Rowan", "Runnels", "Rush", "Rush", "Rusk",
    "Rusk", "Russell", "Russell", "Russell", "Russell", "Rutherford",
    "Rutherford", "Rutland", "Río Grande", "Sabana Grande", "Sabine",
    "Sabine Parish", "Sac", "Sacramento", "Sagadahoc", "Saginaw", "Saguache",
    "Saint Croix", "Saint John", "Saint Thomas", "Saipan", "Salem", "Salem",
    "Salinas", "Saline", "Saline", "Saline", "Saline", "Saline", "Salt Lake",
    "Saluda", "Sampson", "San Augustine", "San Benito", "San Bernardino",
    "San Diego", "San Francisco", "San Germán", "San Jacinto", "San Joaquin",
    "San Juan", "San Juan", "San Juan", "San Juan", "San Juan", "San Lorenzo",
    "San Luis Obispo", "San Mateo", "San Miguel", "San Miguel", "San Patricio",
    "San Saba", "San Sebastián", "Sanborn", "Sanders", "Sandoval", "Sandusky",
    "Sangamon", "Sanilac", "Sanpete", "Santa Barbara", "Santa Clara",
    "Santa Cruz", "Santa Cruz", "Santa Fe", "Santa Isabel", "Santa Rosa",
    "Sarasota", "Saratoga", "Sargent", "Sarpy", "Sauk", "Saunders", "Sawyer",
    "Schenectady", "Schleicher", "Schley", "Schoharie", "Schoolcraft",
    "Schuyler", "Schuyler", "Schuyler", "Schuylkill", "Scioto", "Scotland",
    "Scotland", "Scott", "Scott", "Scott", "Scott", "Scott", "Scott", "Scott",
    "Scott", "Scott", "Scott", "Scott", "Scotts Bluff", "Screven", "Scurry",
    "Searcy", "Sebastian", "Sedgwick", "Sedgwick", "Seminole", "Seminole",
    "Seminole", "Seneca", "Seneca", "Sequatchie", "Sequoyah", "Sevier",
    "Sevier", "Sevier", "Seward", "Seward", "Shackelford", "Shannon",
    "Sharkey", "Sharp", "Shasta", "Shawano", "Shawnee", "Sheboygan", "Shelby",
    "Shelby", "Shelby", "Shelby", "Shelby", "Shelby", "Shelby", "Shelby",
    "Shelby", "Shenandoah", "Sherburne", "Sheridan", "Sheridan", "Sheridan",
    "Sheridan", "Sheridan", "Sherman", "Sherman", "Sherman", "Sherman",
    "Shiawassee", "Shoshone", "Sibley", "Sierra", "Sierra", "Silver Bow",
    "Simpson", "Simpson", "Sioux", "Sioux", "Sioux", "Siskiyou", "Sitka",
    "Skagit", "Skamania", "Slope", "Smith", "Smith", "Smith", "Smith", "Smyth",
    "Snohomish", "Snyder", "Socorro", "Solano", "Somerset", "Somerset",
    "Somerset", "Somerset", "Somervell", "Sonoma", "Southampton",
    "Southeast Fairbanks Census Area", "Spalding", "Spartanburg", "Spencer",
    "Spencer", "Spink", "Spokane", "Spotsylvania", "St. Bernard Parish",
    "St. Charles", "St. Charles Parish", "St. Clair", "St. Clair", "St. Clair",
    "St. Clair", "St. Croix", "St. Francis", "St. Francois",
    "St. Helena Parish", "St. James Parish", "St. John the Baptist Parish",
    "St. Johns", "St. Joseph", "St. Joseph", "St. Landry Parish",
    "St. Lawrence", "St. Louis", "St. Louis", "St. Louis", "St. Lucie",
    "St. Martin Parish", "St. Mary Parish", "St. Mary's", "St. Tammany Parish",
    "Stafford", "Stafford", "Stanislaus", "Stanley", "Stanly", "Stanton",
    "Stanton", "Stark", "Stark", "Stark", "Starke", "Starr", "Staten Island",
    "Staunton", "Ste. Genevieve", "Stearns", "Steele", "Steele", "Stephens",
    "Stephens", "Stephens", "Stephenson", "Sterling", "Steuben", "Steuben",
    "Stevens", "Stevens", "Stevens", "Stewart", "Stewart", "Stillwater",
    "Stoddard", "Stokes", "Stone", "Stone", "Stone", "Stonewall", "Storey",
    "Story", "Strafford", "Stutsman", "Sublette", "Suffolk", "Suffolk",
    "Suffolk", "Sullivan", "Sullivan", "Sullivan", "Sullivan", "Sullivan",
    "Sullivan", "Sully", "Summers", "Summit", "Summit", "Summit", "Sumner",
    "Sumner", "Sumter", "Sumter", "Sumter", "Sumter", "Sunflower", "Surry",
    "Surry", "Susquehanna", "Sussex", "Sussex", "Sussex", "Sutter", "Sutton",
    "Suwannee", "Swain", "Swains Island", "Sweet Grass", "Sweetwater", "Swift",
    "Swisher", "Switzerland", "Talbot", "Talbot", "Taliaferro", "Talladega",
    "Tallahatchie", "Tallapoosa", "Tama", "Taney", "Tangipahoa Parish", "Taos",
    "Tarrant", "Tate", "Tattnall", "Taylor", "Taylor", "Taylor", "Taylor",
    "Taylor", "Taylor", "Taylor", "Tazewell", "Tazewell", "Tehama", "Telfair",
    "Teller", "Tensas Parish", "Terrebonne Parish", "Terrell", "Terrell",
    "Terry", "Teton", "Teton", "Teton", "Texas", "Texas", "Thayer",
    "The Bronx", "Thomas", "Thomas", "Thomas", "Throckmorton", "Thurston",
    "Thurston", "Tift", "Tillamook", "Tillman", "Tinian Municipality", "Tioga",
    "Tioga", "Tippah", "Tippecanoe", "Tipton", "Tipton", "Tishomingo", "Titus",
    "Toa Alta", "Toa Baja", "Todd", "Todd", "Todd", "Tolland", "Tom Green",
    "Tompkins", "Tooele", "Toole", "Toombs", "Torrance", "Towner", "Towns",
    "Traill", "Transylvania", "Traverse", "Travis", "Treasure", "Trego",
    "Trempealeau", "Treutlen", "Trigg", "Trimble", "Trinity", "Trinity",
    "Tripp", "Troup", "Trousdale", "Trujillo Alto", "Trumbull", "Tucker",
    "Tulare", "Tulsa", "Tunica", "Tuolumne", "Turner", "Turner", "Tuscaloosa",
    "Tuscarawas", "Tuscola", "Twiggs", "Twin Falls", "Tyler", "Tyler",
    "Tyrrell", "Uinta", "Uintah", "Ulster", "Umatilla", "Unicoi", "Union",
    "Union", "Union", "Union", "Union", "Union", "Union", "Union", "Union",
    "Union", "Union", "Union", "Union", "Union", "Union", "Union", "Union",
    "Union Parish", "Upshur", "Upshur", "Upson", "Upton", "Utah", "Utuado",
    "Uvalde", "Val Verde", "Valdez–Cordova Census Area", "Valencia", "Valley",
    "Valley", "Valley", "Van Buren", "Van Buren", "Van Buren", "Van Buren",
    "Van Wert", "Van Zandt", "Vance", "Vanderburgh", "Vega Alta", "Vega Baja",
    "Venango", "Ventura", "Vermilion", "Vermilion Parish", "Vermillion",
    "Vernon", "Vernon", "Vernon Parish", "Victoria", "Vieques", "Vigo",
    "Vilas", "Villalba", "Vinton", "Virginia Beach", "Volusia", "Wabash",
    "Wabash", "Wabasha", "Wabaunsee", "Wadena", "Wagoner", "Wahkiakum", "Wake",
    "Wake Island", "Wakulla", "Waldo", "Walker", "Walker", "Walker",
    "Walla Walla", "Wallace", "Waller", "Wallowa", "Walsh", "Walthall",
    "Walton", "Walton", "Walworth", "Walworth", "Wapello", "Ward", "Ward",
    "Ware", "Warren", "Warren", "Warren", "Warren", "Warren", "Warren",
    "Warren", "Warren", "Warren", "Warren", "Warren", "Warren", "Warren",
    "Warren", "Warrick", "Wasatch", "Wasco", "Waseca", "Washakie", "Washburn",
    "Washington", "Washington", "Washington", "Washington", "Washington",
    "Washington", "Washington", "Washington", "Washington", "Washington",
    "Washington", "Washington", "Washington", "Washington", "Washington",
    "Washington", "Washington", "Washington", "Washington", "Washington",
    "Washington", "Washington", "Washington", "Washington", "Washington",
    "Washington", "Washington", "Washington", "Washington", "Washington",
    "Washington Parish", "Washington", "Washita", "Washoe", "Washtenaw",
    "Watauga", "Watonwan", "Waukesha", "Waupaca", "Waushara", "Wayne", "Wayne",
    "Wayne", "Wayne", "Wayne", "Wayne", "Wayne", "Wayne", "Wayne", "Wayne",
    "Wayne", "Wayne", "Wayne", "Wayne", "Wayne", "Wayne", "Waynesboro",
    "Weakley", "Webb", "Weber", "Webster", "Webster", "Webster", "Webster",
    "Webster", "Webster", "Webster", "Webster Parish", "Weld", "Wells",
    "Wells", "West Baton Rouge Parish", "West Carroll Parish",
    "West Feliciana Parish", "Westchester", "Western District", "Westmoreland",
    "Westmoreland", "Weston", "Wetzel", "Wexford", "Wharton", "Whatcom",
    "Wheatland", "Wheeler", "Wheeler", "Wheeler", "Wheeler", "White", "White",
    "White", "White", "White", "White Pine", "Whiteside", "Whitfield",
    "Whitley", "Whitley", "Whitman", "Wibaux", "Wichita", "Wichita",
    "Wicomico", "Wilbarger", "Wilcox", "Wilcox", "Wilkes", "Wilkes", "Wilkin",
    "Wilkinson", "Wilkinson", "Will", "Willacy", "Williams", "Williams",
    "Williamsburg", "Williamsburg", "Williamson", "Williamson", "Williamson",
    "Wilson", "Wilson", "Wilson", "Wilson", "Winchester", "Windham", "Windham",
    "Windsor", "Winkler", "Winn Parish", "Winnebago", "Winnebago", "Winnebago",
    "Winneshiek", "Winona", "Winston", "Winston", "Wirt", "Wise", "Wise",
    "Wolfe", "Wood", "Wood", "Wood", "Wood", "Woodbury", "Woodford", "Woodford",
    "Woodruff", "Woods", "Woodson", "Woodward", "Worcester", "Worcester",
    "Worth", "Worth", "Worth", "Wright", "Wright", "Wright", "Wyandot",
    "Wyandotte", "Wyoming", "Wyoming", "Wyoming", "Wythe", "Yabucoa", "Yadkin",
    "Yakima", "Yakutat", "Yalobusha", "Yamhill", "Yancey", "Yankton", "Yates",
    "Yauco", "Yavapai", "Yazoo", "Yell", "Yellow Medicine", "Yellowstone",
    "Yoakum", "Yolo", "York", "York", "York", "York", "York", "Young", "Yuba",
    "Yukon–Koyukuk Census Area", "Yuma", "Yuma", "Zapata", "Zavala", "Ziebach"
]

us_counties = [
    'dale', 'spencer', 'converse', 'cascade', 'yukon–koyukuk census area',
    'darlington', 'audubon', 'glades', 'okaloosa', 'patrick', 'hayes',
    'leslie', 'st. francis', 'garrett', 'barnwell', 'greensville', 'escambia',
    'emery', 'turner', 'cache', 'mcculloch', 'crisp', 'danville', 'stillwater',
    'hunterdon', 'sedgwick', 'culpeper', 'parmer', 'nacogdoches',
    'copper river census area', 'ellsworth', 'itawamba', 'ritchie', 'hardeman',
    'bear lake', 'galveston', 'el paso', 'wilkes', 'dawes', 'hamblen',
    'bristol', 'whatcom', 'dallas', 'shenandoah', 'wilkinson', 'huerfano',
    'humphreys', 'aguas buenas', 'cortland', 'goshen', 'winneshiek', 'geary',
    'warren', 'mineral', 'woodson', 'caledonia', 'pasquotank',
    'kusilvak census area', 'sweet grass', 'tangipahoa parish', 'ector',
    'johnston', 'burnet', 'sarpy', 'seneca', 'borden', 'sabine parish',
    'heard', 'barrow', 'brooklyn', 'macoupin', 'magoffin', 'deaf smith',
    'hamlin', 'pitt', 'edgefield', 'mississippi', 'mcdowell', 'atoka',
    'hemphill', 'wakulla', 'lee', 'oglethorpe', 'sebastian', 'henderson',
    'roberts', 'mahnomen', 'bay', 'robertson', 'effingham', 'wilkin', 'mohave',
    'fremont', 'towns', 'fauquier', 'guilford', 'amador', 'izard', 'yalobusha',
    'mahaska', 'meigs', 'calcasieu parish', 'hardin', 'keweenaw', 'antrim',
    'rolette', 'st. bernard parish', 'early', 'keith', 'allen', 'mills',
    'millard', 'wichita', 'barren', 'coal', 'perkins', 'klamath', 'rhea',
    'kanawha', 'anchorage', 'black hawk', 'morton', 'añasco', 'bandera',
    'wallace', 'plaquemines parish', 'muskogee', 'craighead', 'chesapeake',
    'cochise', 'worcester', 'skamania', 'fisher', 'powder river', 'arkansas',
    'washoe', 'ceiba', 'king', 'olmsted', 'clermont', 'rockwall', 'bullock',
    'buchanan', 'lunenburg', 'carroll', 'saguache', 'marathon', 'bradley',
    'nicholas', 'hampshire', 'merrimack', 'laurens', 'hale', 'tensas parish',
    'washakie', 'coryell', 'steele', 'rutherford', 'rice', 'manatee',
    'siskiyou', 'cobb', 'dixie', 'kalamazoo', 'midway atoll', 'pendleton',
    'roscommon', 'sanilac', 'kossuth', 'wabash', 'tillman', 'wilcox',
    'stoddard', 'lorain', 'putnam', 'orange', 'elko', 'atkinson', 'tompkins',
    'waller', 'tinian municipality', 'hopkins', 'appanoose',
    'southeast fairbanks census area', 'santa rosa', 'kingsbury', 'ionia',
    "st. mary's", 'ashley', 'rich', 'tama', 'mobile', 'pontotoc', 'fountain',
    'cheyenne', 'crockett', 'western district', 'wheatland', 'ada', 'meade',
    'yancey', 'riverside', 'corozal', 'mcpherson', 'lackawanna', 'zapata',
    'tripp', 'cowlitz', 'goliad', 'taylor', 'carlisle', 'dimmit',
    'roger mills', 'echols', 'cullman', 'humacao', 'sargent', 'pitkin',
    'cabo rojo', 'audrain', 'morris', 'darke', 'la salle parish', 'claiborne',
    'weld', 'trimble', 'gasconade', 'cherokee', 'snohomish', 'lares',
    'switzerland', 'phillips', 'nobles', 'cerro gordo', 'callahan', 'roane',
    'dupage', 'frontier', 'gonzales', 'phelps', 'pearl river', 'bexar',
    'haskell', 'box butte', 'holmes', 'hopewell', 'san luis obispo', 'wasco',
    'malheur', 'dodge', 'chambers', 'nolan', 'houghton', 'park', 'bingham',
    'merced', 'salem', 'walthall', 'dubuque', 'louisa', 'somervell', 'laramie',
    'irion', 'sutton', 'james city', 'allegheny', 'hitchcock', 'canyon',
    'schoharie', 'ouachita', 'summit', 'ste. genevieve', 'bailey', 'sanders',
    'branch', 'clark', "queen anne's", 'fairbanks north star borough',
    'yoakum', 'wallowa', 'maricao', 'hood river', 'issaquena', "o'brien",
    'catawba', 'preston', 'sully', 'guthrie', 'pettis', 'iroquois',
    'barnstable', 'orangeburg', 'boulder', 'jefferson davis parish', 'durham',
    'denali borough', 'ulster', 'latimer', 'spalding', 'lehigh', 'schleicher',
    'lagrange', 'kay', 'columbia', 'ford', 'floyd', 'wyandotte', 'grand isle',
    'lincoln', 'gosper', 'san mateo', 'coffee', 'vega baja', 'austin',
    'de baca', 'owyhee', 'blaine', 'lauderdale', 'newton', 'elliott', 'horry',
    'fairfax', 'mountrail', 'concho', 'acadia parish', 'san germán',
    'missaukee', 'hennepin', 'latah', 'mcclain', 'coös', 'travis',
    'montmorency', 'big horn', 'tunica', 'norfolk', 'alger', 'hyde', 'suffolk',
    'otero', 'neshoba', 'juncos', 'reeves', 'adair', 'north slope borough',
    'burleson', 'tipton', 'aleutians west census area', 'schoolcraft',
    'powhatan', 'highlands', 'racine', 'owsley', 'smyth', 'stephens',
    'portage', 'aibonito', 'canadian', 'mellette', 'monona', 'traill',
    'watauga', 'dewey', 'lewis and clark', 'lamoille', 'duplin', 'wood',
    'crowley', 'calumet', 'pittsylvania', 'st. clair', 'lassen', 'benzie',
    'columbus', 'gregory', 'kings', 'cowley', 'pierce', 'pine', 'green lake',
    'teller', 'williamson', 'missoula', 'alamosa', 'aurora', 'thomas',
    'pennington', 'lemhi', 'del norte', 'tucker', 'val verde', 'colorado',
    'saline', 'pottawattamie', 'stanton', 'gage', 'jeff davis', 'maury',
    'kemper', 'garvin', 'philadelphia', 'montezuma', 'pleasants',
    'northampton', 'gila', 'grand', 'trego', 'platte', 'st. lawrence', 'nash',
    'pima', 'rawlins', 'wharton', 'ziebach', 'inyo', 'pickaway', 'bennington',
    'new haven', 'snyder', 'hinds', 'cattaraugus', 'iredell', 'webb',
    'keya paha', 'divide', 'cameron', 'dickenson', 'walsh', 'huntington',
    'cameron parish', 'king william', 'oregon', 'rogers', 'yakima', 'hardee',
    'crosby', 'tyrrell', 'plumas', 'pickett', 'morehouse parish',
    'bristol bay borough', 'scott', 'muscatine', 'trousdale', 'upton', 'burt',
    'kidder', 'miller', 'doddridge', 'menard', 'cavalier', 'st. martin parish',
    'bracken', 'freeborn', 'curry', 'wyandot', 'washita', 'judith basin',
    'dunn', 'peach', 'mccracken', 'story', 'cheshire', 'halifax', 'sitka',
    'alameda', 'gilpin', 'republic', 'esmeralda', 'buffalo', 'ripley',
    'hot springs', 'yuma', 'rusk', 'laporte', 'brantley', 'weakley', 'bulloch',
    'culebra', 'st. mary parish', 'luquillo', 'falls', 'honolulu', 'coamo',
    'trinity', 'yavapai', 'citrus', 'seward', 'bethel census area', 'sterling',
    'tulare', 'noble', 'cidra', 'brooks', 'dooly', 'owen', 'greenwood',
    'rosebud', 'guernsey', 'bowman', 'watonwan', 'cape may', 'imperial',
    'jarvis island', 'colonial heights', 'angelina', 'shelby', 'greer',
    'coffey', 'candler', 'le flore', 'woodbury', 'waseca', 'cass', 'coles',
    'montrose', 'ness', 'grand traverse', 'staten island', 'simpson',
    'berrien', 'llano', 'hockley', 'lyon', 'dewitt', 'payette', 'shawnee',
    'mecosta', 'navajo', 'san diego', 'routt', 'mora', 'sharp', 'rose atoll',
    'cherry', 'bell', 'virginia beach', 'patillas', 'kleberg', 'anoka',
    'georgetown', 'costilla', 'mccone', 'kit carson', 'cooper', 'mono',
    'sequoyah', 'eddy', 'the bronx', 'clare', 'lasalle', 'swisher', 'scurry',
    'jefferson parish', 'gibson', 'overton', 'san lorenzo', 'edgecombe',
    'catron', 'baca', 'radford', 'isabela', 'graham', 'indiana',
    'assumption parish', 'wake island', 'new york', 'tioga', 'isabella',
    'whitman', 'troup', 'furnas', 'camden', 'white pine', 'saunders',
    'chippewa', 'clay', 'hancock', 'hansford', 'nassau', 'garfield',
    'josephine', 'refugio', 'buena vista', 'hudson', 'finney', 'bledsoe',
    'grundy', 'pecos', 'price', 'waynesboro', 'windsor', 'mayes', 'humboldt',
    'dent', 'willacy', 'webster parish', 'amherst', 'colfax', 'wadena',
    'lac qui parle', 'lapeer', 'ohio', 'jackson parish', 'panola', 'yabucoa',
    'dillingham census area', 'brazoria', 'johnston atoll', 'iron', 'cabell',
    'collingsworth', 'abbeville', 'benson', 'mcdonald', 'valley', 'kendall',
    'lampasas', 'glasscock', 'muskegon', 'umatilla', 'bossier parish',
    'treutlen', 'leon', 'gurabo', 'henrico', 'conecuh', 'stonewall',
    'navassa island', 'st. joseph', 'colbert', 'knox', 'lancaster', 'waukesha',
    'fillmore', 'clarke', 'st. james parish', 'maricopa', 'wright',
    'oktibbeha', 'beltrami', 'archer', 'butler', 'litchfield', 'pipestone',
    'stutsman', 'treasure', 'crane', 'hocking', 'placer', 'allen parish',
    'archuleta', "manu'a district", 'denton', 'rockcastle', 'red river',
    'norman', 'oakland', 'harding', 'passaic', 'rapides parish',
    'king and queen', 'greeley', 'jones', 'trujillo alto', 'washington parish',
    'bennett', 'nodaway', 'dare', 'williams', 'accomack', 'steuben', 'reno',
    'albany', 'hinsdale', 'harmon', 'starke', 'perquimans', 'utuado', 'chowan',
    'fulton', 'eagle', 'swift', 'lafayette parish', 'boyd', 'mchenry',
    'musselshell', 'hampton', 'pemiscot', 'neosho', 'beaver', 'coos',
    'outagamie', 'petroleum', 'sierra', 'susquehanna', 'breathitt', 'jerome',
    'sublette', 'claiborne parish', 'yauco', 'charlotte', 'door', 'houston',
    'daviess', 'nelson', 'taney', 'calhoun', 'apache', 'larue', 'gallia',
    'kennebec', 'haywood', 'palo alto', 'southampton', 'eau claire',
    'kootenai', 'stone', 'cannon', 'marin', 'ellis', 'brewster', 'hays',
    'quitman', 'rio grande', 'kenedy', 'red river parish', 'lumpkin',
    'kenai peninsula borough', 'baraga', 'yazoo', 'corson', 'fleming',
    'billings', 'modoc', 'morrill', 'saint croix', 'searcy', 'lanier',
    'marion', 'clear creek', 'new hanover', 'alexander', 'nantucket',
    'sherman', 'becker', 'contra costa', 'breckinridge', 'jasper', 'allamakee',
    'whiteside', 'champaign', 'manhattan', 'upshur', 'sabine', 'cuyahoga',
    'sonoma', 'zavala', 'chickasaw', 'fluvanna', 'king george', 'auglaize',
    'kenosha', 'yadkin', 'okfuskee', 'buckingham', 'coosa', 'jayuya', 'glenn',
    'dixon', 'currituck', 'stevens', 'moore', 'barbour', 'prentiss', 'bates',
    'conejos', 'muhlenberg', 'desha', 'cabarrus', 'santa barbara', 'el dorado',
    'conway', 'stafford', 'brevard', 'sharkey', 'st. lucie', 'medina', 'bland',
    'pawnee', 'okanogan', 'richmond', 'hutchinson', 'villalba', 'st. francois',
    'childress', 'hartford', 'erie', 'bleckley', 'harvey', 'le sueur', 'wirt',
    'mifflin', 'richland', 'huntingdon', 'anderson', 'kent', 'wicomico',
    'ravalli', 'gooding', 'loudoun', 'tuscola', 'kittitas', 'eastland',
    'hatillo', 'sequatchie', 'san patricio', 'uvalde', 'venango', 'bourbon',
    'surry', 'tom green', 'cleveland', 'prowers', 'barry', 'saint john',
    'fredericksburg', 'haakon', 'chase', 'towner', 'banner', 'crow wing',
    'stearns', 'walworth', 'coconino', 'alachua', 'sanpete', 'morrow',
    'allendale', 'harnett', 'tolland', 'asotin', 'santa isabel', 'osage',
    'grand forks', 'hart', 'thurston', 'scotland', 'kosciusko', 'las marías',
    'chelan', 'franklin', 'midland', 'arenac', 'franklin parish', 'wexford',
    'davison', 'weston', 'mcnairy', 'dubois', 'twin falls', 'gwinnett',
    'crook', 'red lake', 'westmoreland', 'schley', 'hernando', 'toa baja',
    'san bernardino', 'colusa', 'daniels', 'rabun', 'rio blanco', 'niagara',
    'davie', 'gillespie', 'chatham', 'manatí', 'weber', 'gilmer', 'aiken',
    'toombs', 'northwest arctic borough', 'nevada', 'entity', 'loíza',
    'yellow medicine', 'jefferson', 'dickinson', 'frio', 'guayanilla', 'ida',
    'storey', 'chilton', 'russell', 'potter', 'peoria', 'la crosse',
    'rio arriba', 'vance', 'garden', 'carbon', 'new london', 'delta',
    'doniphan', 'lake and peninsula borough', 'bladen', 'jim wells',
    'livingston', 'caroline', 'mahoning', 'oceana', 'payne', 'codington',
    'suwannee', 'hoonah–angoon census area', 'lucas', 'carteret', 'windham',
    'forest', 'yakutat', 'avoyelles parish', 'brule', 'knott', 'fallon', 'nye',
    'gilchrist', 'jim hogg', 'traverse', 'kittson', 'cibola', 'rock island',
    'sarasota', 'scioto', 'onslow', 'albemarle', 'forsyth', 'jessamine',
    'idaho', 'jackson', 'elbert', 'richardson', 'craig', 'brown', 'major',
    'goodhue', 'haines borough', 'obion', 'dekalb', 'dawson',
    'west feliciana parish', 'richland parish', 'monongalia', 'victoria',
    'greenbrier', 'emmet', 'alleghany', 'somerset', 'custer', 'vermillion',
    'sandusky', 'centre', 'beaufort', 'oldham', 'baker island', 'hettinger',
    'loup', 'levy', 'woods', 'lake', 'coahoma', 'caguas', 'tazewell',
    'doña ana', 'muscogee', 'piscataquis', 'bowie', 'piatt', 'maunabo',
    'rowan', 'essex', 'loving', 'carolina', 'dillon', 'nome census area',
    'hudspeth', 'guadalupe', 'webster', 'davis', 'sagadahoc',
    'caldwell parish', 'petersburg', 'lafayette', 'alpena', 'carver', 'murray',
    'norton', 'kane', 'ross', 'pueblo', 'mackinac', 'juana díaz', 'hall',
    'otoe', 'cimarron', 'dorchester', 'naguabo', 'dorado', 'unicoi', 'starr',
    'chariton', 'bernalillo', 'morgan', 'grady', 'atlantic', 'grainger',
    'atchison', 'spink', 'solano', 'red willow', 'vilas', 'tippah', 'arthur',
    'ogemaw', 'rincón', 'cochran', 'whitley', 'antelope', 'graves',
    'nez perce', 'carson city', 'broward', 'lexington', 'hawaii', 'rankin',
    'niobrara', 'middlesex', 'mclean', 'leelanau', 'calaveras', 'hertford',
    'pender', 'randall', 'monroe', 'dundy', 'andrews', 'genesee', 'randolph',
    'monterey', 'chugach census area', 'marquette', 'orocovis', 'parke',
    'culberson', 'kingman', 'catahoula parish', 'bremer', 'drew',
    'mecklenburg', 'prince edward', 'wilson', 'sevier', 'caddo parish',
    'eaton', 'dauphin', 'rutland', 'st. tammany parish', 'andrew',
    'san augustine', 'aroostook', 'gaston', 'kerr', 'sutter', 'oglala lakota',
    'caldwell', 'duval', 'utah', 'grays harbor', 'isanti', 'habersham',
    'mason', 'chautauqua', 'sumter', 'glynn', 'quebradillas', 'gadsden',
    'montcalm', 'howell', 'jay', 'arroyo', 'adjuntas', 'miner', 'edmonson',
    'kanabec', 'nance', 'barton', 'young', 'hood', 'amelia', 'kershaw',
    'rooks', 'cook', 'mesa', 'augusta', 'braxton', 'oxford', 'chaffee',
    'oconee', 'aransas', 'ashtabula', 'presidio', 'box elder', 'columbiana',
    'santa clara', 'coshocton', 'anson', 'kimball', 'cambria', 'sacramento',
    'milwaukee', 'st. landry parish', 'mcleod', 'nowata', 'pickens', 'gregg',
    'bottineau', 'multnomah', 'sioux', 'stanley', 'runnels', 'dickey',
    'pocahontas', 'fannin', 'pittsburg', 'real', 'waldo', 'gray',
    'westchester', 'ware', 'dyer', 'winston', 'davidson', 'foster',
    'palmyra atoll', 'copiah', 'livingston parish', 'dunklin', 'gordon',
    'cayey', 'moody', 'jerauld', 'oswego', 'stark', 'stokes', 'crenshaw',
    'bullitt', "prince george's", 'nueces', 'clinch', 'granville', 'dane',
    'defiance', 'piute', 'power', 'marinette', 'manassas', 'ogle', 'mccook',
    'talbot', 'faribault', 'clearfield', 'texas', 'navarro', 'sangamon',
    'white', 'estill', 'burleigh', 'baxter', 'page', 'cloud', 'tooele', 'day',
    'fergus', 'skagit', 'gallatin', 'vigo', 'baldwin', 'ponce', 'blackford',
    'colquitt', 'nicollet', 'tallahatchie', 'evans', 'motley', 'big stone',
    'shasta', 'gates', 'tulsa', 'emporia', 'lenoir', 'clallam', 'jersey',
    'walton', 'george', 'sheridan', 'huron', 'callaway', 'loudon', 'mccurtain',
    'valencia', 'desoto parish', 'onondaga', 'chemung', 'iosco', 'mitchell',
    'vanderburgh', 'kearney', 'kingman reef', 'ouachita parish', 'chisago',
    'appling', 'chaves', 'falls church', 'yellowstone', 'bon homme',
    'harrisonburg', 'ransom', 'washtenaw', 'nuckolls', 'daggett',
    'san francisco', 'orleans parish', 'lake of the woods', 'winkler',
    'aguadilla', 'bibb', 'green', 'kitsap', 'kodiak island borough', 'merrick',
    'fentress', 'miami', 'lamb', 'matagorda', 'natrona', 'ozark', 'rockland',
    'bolivar', 'fall river', 'leake', 'oliver', 'baltimore', 'elkhart',
    'union parish', 'emmons', 'greenville', 'bee', 'lenawee', 'crawford',
    'shiawassee', 'monmouth', 'sheboygan', 'faulkner', 'butte',
    'prince of wales–hyder census area', 'torrance', 'kinney', 'maui',
    'manitowoc', 'harris', 'roanoke', 'cottonwood', 'labette', 'saipan',
    'garrard', 'cape girardeau', 'dukes', 'trigg', 'hampden', 'androscoggin',
    'cecil', 'thayer', 'glacier', 'live oak', 'benton', 'jennings',
    'silver bow', 'pointe coupee parish', 'lander', 'charlottesville',
    'st. louis', 'shoshone', 'cataño', 'arapahoe', 'new madrid', 'mingo',
    'pushmataha', 'terry', 'powell', 'cassia', 'tehama', 'belmont',
    'clackamas', 'lafourche parish', 'martinsville', 'strafford', 'poquoson',
    'carson', 'vermilion', 'edwards', 'alcona', 'wythe', 'benewah', 'cottle',
    'allegan', 'klickitat', 'lonoke', 'ozaukee', 'langlade', 'lipscomb',
    'san joaquin', 'taliaferro', 'las animas', 'trempealeau', 'milam',
    'hidalgo', 'wyoming', 'moca', 'donley', 'glascock', 'deer lodge',
    'alamance', 'clinton', 'lynchburg', 'mcmullen', 'new kent', 'wagoner',
    'atascosa', 'taos', 'beauregard parish', 'hoke', 'worth', 'bartow',
    'st. helena parish', 'lawrence', 'luzerne', 'throckmorton', 'gratiot',
    'guayama', 'union', 'blue earth', 'marlboro', 'pembina', 'grenada',
    'fayette', 'woodward', 'meagher', 'moultrie', 'bradford', 'otsego', 'hunt',
    'lea', 'menifee', 'rains', 'st. charles parish', 'sweetwater', 'limestone',
    'northern islands municipality', 'harford', 'roosevelt', 'ochiltree',
    'faulk', 'keokuk', 'yolo', 'stanly', 'chicot', 'boone', 'dinwiddie',
    'little river', 'kenton', 'rockdale', 'addison', 'howard', 'licking',
    'camp', 'swain', 'summers', 'broome', 'vieques', 'gilliam', 'coke',
    'ontario', 'lincoln parish', 'bureau', 'caddo', 'kaufman', 'marengo',
    'juniata', 'lane', 'sac', 'mckenzie', 'staunton', 'dallam', 'tillamook',
    'wibaux', 'cumberland', 'hickory', 'waupaca', 'st. john the baptist parish',
    'penobscot', 'wabaunsee', 'winnebago', 'yuba', 'dutchess', 'tippecanoe',
    'preble', 'cayuga', 'bamberg', 'sandoval', 'naranjito', 'nemaha',
    'berkeley', 'sibley', 'wise', 'carter', 'flagler', 'jo daviess', 'denver',
    'covington', 'sauk', 'madera', 'luna', 'pend oreille', 'arlington',
    'dakota', 'cotton', 'redwood', 'liberty', 'pratt', 'macomb', 'creek',
    'hill', 'lynn', 'williamsburg', 'calloway', 'charles city', 'love',
    'tallapoosa', 'clatsop', 'casey', 'upson', 'chesterfield', 'swains island',
    'spotsylvania', 'guaynabo', 'pinellas', 'mower', 'briscoe', 'uinta',
    'wayne', 'charlton', 'prince william', 'grimes', 'la paz', 'fond du lac',
    'berks', 'garza', 'mille lacs', 'vinton', 'colleton', 'comanche', 'porter',
    'hanover', 'delaware', 'kalkaska', 'peñuelas', 'lewis', 'mclennan',
    'yankton', 'alpine', 'madison', 'jack', 'shannon', 'telfair', 'des moines',
    'sanborn', 'san benito', 'winn parish', 'rockbridge', 'collin',
    'matanuska-susitna borough', 'wells', 'orleans', 'montgomery', 'charlevoix',
    'ashe', 'calvert', 'ralls', 'banks', 'rock', 'camuy', 'leavenworth',
    'winchester', 'mcminn', 'gem', 'perry', 'ontonagon', 'st. charles',
    'sumner', 'newport news', 'burlington', 'hillsdale', 'autauga',
    'mendocino', 'beckham', 'meriwether', 'allegany', 'natchitoches parish',
    'ray', 'tuolumne', 'clayton', 'riley', 'grayson', 'transylvania',
    'plymouth', 'ramsey', 'irwin', 'metcalfe', 'athens', 'bedford', 'kewaunee',
    'posey', 'tattnall', 'ferry', 'reagan', 'hand', 'gentry', 'cooke',
    'rappahannock', 'bucks', 'warrick', 'giles', 'larimer', 'christian',
    'buncombe', 'wake', 'coweta', 'raleigh', 'barceloneta', 'bacon', 'forrest',
    'wolfe', 'ballard', 'yell', 'bonner', 'wetzel', 'chouteau', 'linn',
    'cuming', 'washburn', 'valdez–cordova census area', 'deschutes',
    'prince george', 'winona', 'polk', 'queens', 'douglas', 'harlan', 'logan',
    'martin', 'alfalfa', 'tishomingo', 'castro', 'craven', 'bergen', 'collier',
    'new castle', 'saratoga', 'walla walla', 'palo pinto', 'lubbock',
    'hickman', 'tuscaloosa', 'san juan', 'harrison', 'ventura', 'alcorn',
    'osborne', 'elmore', 'bertie', 'bosque', 'dearborn', 'west carroll parish',
    'kandiyohi', 'toa alta', 'miami-dade', 'burke', 'tuscarawas', 'sawyer',
    'blanco', 'spokane', 'palm beach', 'east carroll parish', 'boise',
    'gunnison', 'terrebonne parish', 'ascension parish', 'broadwater', 'blair',
    'osceola', 'schuylkill', 'kiowa', 'wilbarger', 'pacific', 'howland island',
    'pepin', 'gogebic', 'leflore', 'bartholomew', 'hot spring', 'rota',
    'madison parish', 'newport', 'oscoda', 'cleburne', 'barber', 'hanson',
    'morrison', 'adams', 'rensselaer', 'trumbull', 'hartley', 'las piedras',
    'tyler', 'henry', 'hillsborough', 'sunflower', 'hardy', 'chittenden',
    'schuyler', 'fajardo', 'gloucester', 'van buren', 'campbell',
    'chattahoochee', 'cheboygan', 'indian river', 'salinas', 'freestone',
    'shackelford', 'volusia', 'brunswick', 'grafton', 'pulaski', 'brazos',
    'long', 'person', 'evangeline parish', 'shawano', 'chattooga',
    'kingfisher', 'lamar', 'río grande', 'bonneville', 'hormigueros',
    'hendricks', 'east baton rouge parish', 'juab', 'isle of wight', 'ciales',
    'beaverhead', 'walker', 'bannock', 'mckean', 'fairfield', 'canóvanas',
    'west baton rouge parish', 'pondera', 'mercer', 'clearwater', 'tarrant',
    'eastern district', 'newaygo', 'lycoming', 'renville', 'tate', 'smith',
    'lyman', 'camas', 'dickson', 'choctaw', 'charleston', 'dolores', 'titus',
    'anne arundel', 'geauga', 'santa fe', 'yates', 'blount', 'san jacinto',
    'la salle', 'rockingham', 'east feliciana parish', 'itasca', 'menominee',
    'boyle', 'jewell', 'roseau', 'golden valley', 'oklahoma', 'sampson',
    'la plata', 'prairie', 'aguada', 'providence', 'foard', 'saluda',
    'aleutians east borough', 'cole', 'caswell', 'wheeler', 'caribou',
    'minidoka', 'iberville parish', 'ketchikan gateway borough', 'bayfield',
    'coleman', 'los alamos', 'petersburg borough', 'barnes', 'okmulgee',
    'luce', 'spartanburg', 'concordia parish', 'gladwin', 'newberry',
    'burnett', 'gove', 'granite', 'grant', 'paulding', 'brooke', 'manistee',
    'todd', 'san sebastián', 'okeechobee', 'rush', 'mccreary', 'attala',
    'manassas park', 'anchorage municipality', 'pinal', 'bayamón',
    'northumberland', 'kankakee', 'dickens', 'sussex', 'van zandt', 'bryan',
    'teton', 'san saba', 'hamilton', 'florence', 'saginaw', 'desoto',
    'yamhill', 'lebanon', 'mcduffie', 'iowa', 'clarion', 'frederick', 'wasatch',
    'hubbard', 'letcher', 'ward', 'montour', 'galax', 'vega alta', 'talladega',
    'oneida', 'geneva', 'laurel', 'dougherty', 'armstrong', 'griggs', 'toole',
    'sabana grande', 'lavaca', 'reynolds', 'herkimer', 'koochiching', 'juneau',
    'pottawatomie', 'morovis', 'cocke', 'elk', 'socorro', 'berkshire',
    'johnson', 'catoosa', 'bastrop', 'chester', 'cedar', 'greenlee', 'harney',
    'chenango', 'sherburne', 'hooker', 'wabasha', 'aitkin', 'independence',
    'comal', 'grant parish', 'kimble', 'hawkins', 'lowndes', 'ottawa', 'tift',
    'st. croix', 'presque isle', 'ocean', 'poweshiek', 'st. johns', 'kern',
    'bienville parish', 'mccormick', 'hodgeman', 'will', 'pike', 'massac',
    'hendry', 'greene', 'wapello', 'vernon', 'ingham', 'ashland', 'flathead',
    'garland', 'moffat', 'slope', 'comerío', 'goochland', 'napa', 'carlton',
    'mathews', 'barron', 'montague', 'maries', 'washington', 'lajas',
    'woodford', 'pershing', 'stanislaus', 'lamoure', 'boundary', 'uintah',
    'highland', 'meeker', 'santa cruz', 'jefferson davis', 'ringgold', 'cross',
    'fort bend', 'gulf', 'screven', 'minnehaha', 'laclede', 'jenkins',
    'florida', 'kalawao', 'mariposa', 'mcdonough', 'mcintosh', 'avery',
    'belknap', 'barranquitas', 'macon', 'scotts bluff', 'edmunds',
    'crittenden', 'edgar', 'marshall', 'quay', 'whitfield', 'dade', 'decatur',
    'cheatham', 'san miguel', 'mckinley', 'parker', 'moniteau', 'otter tail',
    'robeson', 'seminole', 'alexandria', 'brookings', 'erath', 'duchesne',
    'vermilion parish', 'clarendon', 'ben hill', 'hughes', 'poinsett', 'pope',
    'ouray', 'portsmouth', 'muskingum', 'gaines', 'charles mix', 'sullivan',
    'holt', 'karnes', 'van wert', 'baylor', 'fresno', 'los angeles', 'noxubee',
    'york', 'churchill', 'iberia parish', 'twiggs', 'saint thomas', 'stewart',
    'bond', 'nottoway', 'kearny', 'schenectady', 'bent', 'pamlico', 'pasco',
    'mayagüez', 'broomfield', 'woodruff', 'botetourt', 'butts', 'emanuel',
    'bath', 'arecibo', 'stephenson', 'terrell', 'deuel', 'baker', 'etowah',
    'salt lake', 'amite', 'waushara', 'bollinger', 'maverick', 'kauai',
    'wahkiakum', 'hempstead', 'island', 'eureka', 'haralson', 'harper',
    'greenup', 'beadle', 'charles', 'vernon parish', 'appomattox', 'oconto',
    'guánica'
]


In [None]:
# us states

us_states = [
    "Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado",
    "Connecticut", "Delaware", "Florida", "Georgia", "Hawaii", "Idaho",
    "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky", "Louisiana", "Maine",
    "Maryland", "Massachusetts", "Michigan", "Minnesota", "Mississippi",
    "Missouri", "Montana", "Nebraska", "Nevada", "New Hampshire", "New Jersey",
    "New Mexico", "New York", "North Carolina", "North Dakota", "Ohio",
    "Oklahoma", "Oregon", "Pennsylvania", "Rhode Island", "South Carolina",
    "South Dakota", "Tennessee", "Texas", "Utah", "Vermont", "Virginia",
    "Washington", "West Virginia", "Wisconsin", "Wyoming"
]

us_states = [x.lower() for x in us_states]
print(us_states)


In [None]:
# aviation_aircraft_model

aircraft_models = [
    "Antonov An-2", "Antonov An-24RV","Antonov An-26","Antonov An-70", "Boeing CT-43"
]

In [None]:
# soccer_football_player

soccer_football_players = [
    "Wissam Ben Yedder", "Presnel Kimpembe", "Luis Alberto", "Jérôme Boateng",
    "Dominik Szoboszlai", "Alvaro Morata", "Ferran Torres", "Ever Banega",
    "Nicolo Barella", "Robin Gosens", "Antoine Griezmann", "Ederson",
    "Marcos Llorente", "Eduardo Camavinga", "Jesus Navas", "Thomas Partey",
    "Dries Mertens", "Eden Hazard", "Richarlison", "Philippe Coutinho",
    "Dominic Calvert-Lewin", "Frenkie de Jong", "Raul Jimenez", "Riyad Mahrez",
    "Achraf Hakimi", "Leroy Sané", "Martin Ödegaard", "Georginio Wijnaldum",
    "Houssem Aouar", "Luka Modric", "Bernardo Silva", "Matthijs de Ligt",
    "Marcel Sabitzer", "Keylor Navas", "Federico Valverde", "Sergio Agüero",
    "Memphis Depay", "Diogo Jota", "Kalidou Koulibaly", "Josip Ilicic",
    "Hakim Ziyech", "Jack Grealish", "Luis Suarez", "Duvan Zapata", "Casemiro",
    "N'Golo Kanté", "Thiago Silva", "Lucas Ocampos", "Dayot Upamecano",
    "Fabinho", "Kai Havertz", "Marquinhos", "Marc-André ter Stegen",
    "Kingsley Coman", "Thibaut Courtois", "Ansu Fati", "Lautaro Martinez",
    "Roberto Firmino", "Toni Kroos", "Marcus Rashford", "Leon Goretzka",
    "Paulo Dybala", "Angel Di Maria", "João Felix",
    "Pierre-Emerick Aubameyang", "Jamie Vardy", "Andrew Robertson",
    "Jordan Henderson", "Alejandro «Papu» Gomez", "David Alaba", "Timo Werner",
    "Jadon Sancho", "Jan Oblak", "Raheem Sterling", "Alisson",
    "Zlatan Ibrahimovic", "Ciro Immobile", "Alphonso Davies", "Heung-min Son",
    "Bruno Fernandes", "Harry Kane", "Romelu Lukaku", "Trent Alexander-Arnold",
    "Thiago Alcantara", "Serge Gnabry", "Manuel Neuer", "Thomas Müller",
    "Sergio Ramos", "Karim Benzema", "Joshua Kimmich", "Virgil van Dijk",
    "Neymar", "Mohamed Salah", "Kylian Mbappé", "Erling Braut Haaland",
    "Sadio Mané", "Kevin De Bruyne", "Cristiano Ronaldo", "Lionel Messi",
    "Robert Lewandowski"
]
soccer_football_players = [x.lower() for x in soccer_football_players]
print(soccer_football_players)

# RegEx LF

In [None]:
unlabeled_data_df_with_type =  unlabeled_data_df[unlabeled_data_df["semanticType"] == "aviation.aircraft_model"]

In [None]:
unlabeled_data_df_with_type[unlabeled_data_df_with_type["dataset_id"] == "102682_73251-16.csv+column_0"].iloc[0]

In [None]:
from snorkel.labeling import labeling_function
regexes = ["Boeing\\s[A-Za-z0-9]+", "Antonov\\s[A-Za-z0-9]+"]
### table colum loader of raw data
def load_tablecolumn(dataset_id:str):
    table_id = dataset_id.split("+")[0]
    column_id = dataset_id.split("+")[1].split("_")[1]
    df_column = pd.read_csv(join(os.environ["TURL_DIR"],table_id), usecols=[int(column_id)])
    return df_column.iloc[:,0].values.tolist()

@labeling_function()
def Reg_ex_search(x):
    current_col = load_tablecolumn(x["dataset_id"])
    min_num = int(0.2 * len(current_col))
    if min_num <= 1: min_num = 2
    number_of_matches = 0
    for regex in regexes:
        r = re.compile(regex)
        matchings = list(filter(r.match,current_col))
        number_of_matches += len(matchings)
    print(number_of_matches, min_num)
    if number_of_matches >= min_num:
        LABEL = label_enc.transform(["aviation.aircraft_model"])[0]
        return LABEL
    return -1

@labeling_function()
def reg_ex_search(x):
    if os.environ["CORPUS"] == "public_bi":
        current_col = load_tablecolumn_public_bi(x["dataset_id"])
    else:
        current_col = load_tablecolumn(x["dataset_id"])
    min_num = int(PERCENTAGE_OF_ELEMENTS_IN_COL * len(current_col))
    if min_num <= 1: min_num = 2
    number_of_matches = 0
    for regex in regexes:
        r = re.compile(regex, re.IGNORECASE)
        matchings = list(filter(r.match, [str(x) for x in current_col]))
        number_of_matches += len(matchings)
    if number_of_matches >= min_num:
        LABEL = label_enc.transform([sem_type])[0]
        return LABEL
    return -1

In [None]:
Reg_ex_search(unlabeled_data_df_with_type[unlabeled_data_df_with_type["dataset_id"] == "102682_73251-16.csv+column_0"].iloc[0])

In [None]:
import re

regexe = ["^\W*(\w+\\b\W*){6,}$", lines[0][11:-2]]
for regex in regexe:
    r = re.compile(regex)
    newlist = list(filter(r.match, ["Tom Brady", "Drew Brees", "kjasndk kasndanda dsd sdsdsd dfdf sadasd", "sf sdsd"]))
    print(regex, newlist)

In [None]:
re.match("^\W*(\w+\\b\W*){6,}$", "tsesdsd sdsd sd asd ad", re.IGNORECASE)

In [None]:
"^\W*(\w+\\b\W*){6,}$"re.match(, "tsesdsd sdsd sd asd ad s", re.IGNORECASE)

In [None]:
with open('./regex_elements_in_col/params/public_bi/description.txt') as f:
    lines = f.readlines()

In [None]:
lines[0][11:-2]

In [None]:
with open('./regex_elements_in_col/params/public_bi/name.txt') as f:
    lines = f.readlines()
lines

# Combine different results form multiple runs

In [None]:
import os
from os.path import join
from dotenv import load_dotenv
load_dotenv(override=True)
import pandas as pd
import json
from sklearn.metrics import classification_report
import numpy as np

#labeled_data_size = 5
distance_threshold = 0.01
for labeled_data_size in [1,2,3,4,5]:
    path = join(
        os.environ["WORKING_DIR"], "emb_clus", "without_knn", "out",
        f"public_bi_clustering_n_classify_results_gen_train_data_{distance_threshold}_{labeled_data_size}_absolute_20.0"
    )
    scores = {
        "f1-scores_macro": [],
        "precisions_macro":[],
        "recalls_macro":[],
        "supports_macro": [],
        "f1-scores_weighted": [],
        "precisions_weighted": [],
        "recalls_weighted": [],
        "supports_weighted": []
    }


    for random_state in [1,2,3,4,5]:
        df_current = pd.read_csv(path+f"_{random_state}.csv")
        df_current = df_current[(df_current["already_labeled"] == False) & (df_current["predicted_type"] != "None")]
        current_class_report = classification_report(df_current["semanticType"],df_current["predicted_type"], output_dict=True)
        for metric in ["macro","weighted"]:
            scores[f"f1-scores_{metric}"].append(current_class_report[f"{metric} avg"]["f1-score"])
            scores[f"precisions_{metric}"].append(current_class_report[f"{metric} avg"]["precision"])
            scores[f"recalls_{metric}"].append(current_class_report[f"{metric} avg"]["recall"])
            scores[f"supports_{metric}"].append(current_class_report[f"{metric} avg"]["support"])

    df_scores = pd.DataFrame(
        np.array([
            scores["f1-scores_macro"], scores["precisions_macro"],
            scores["recalls_macro"], scores["supports_macro"],
            scores["f1-scores_weighted"], scores["precisions_weighted"],
            scores["recalls_weighted"], scores["supports_weighted"]
        ]), index=scores.keys())
    df_scores["mean"] = df_scores.mean(axis=1)
    df_scores["std"] = df_scores.std(axis=1)
    df_scores["var"] = df_scores.var(axis=1)

    df_scores.to_csv(path+"_mean.csv")


In [None]:
import os
from os.path import join
from dotenv import load_dotenv
import pandas as pd
load_dotenv(override=True)
from sql_metadata import Parser

def load_tablecolum_header_public_bi(dataset_id: str):
    table_id = dataset_id.split("+")[0]
    folder_id = table_id.split("_")[0]
    column_id = dataset_id.split("+")[1].split("_")[1]
    sql = open(join(os.environ["PUBLIC_BI_BENCHMARK"], folder_id, "tables", f"{table_id}.table.sql"), "r")
    header = Parser(sql.read()).columns[int(column_id)]
    return header 

In [None]:
from regex_elements_in_col.run_regex_elements_in_col import load_tablecolumn_public_bi

In [None]:
current_col = load_tablecolumn_public_bi("HashTags_1+column_58")

In [None]:
current_col

In [None]:
min_num = 0.2 * len(current_col)
predefined_elements = ["m","f","u"]
if min_num <= 1: min_num = 2
num_of_elements_in_col = len([
    x for x in current_col
    if str(x).lower() in predefined_elements
])
print(num_of_elements_in_col, min_num)
if num_of_elements_in_col >= min_num:
    print(True)

In [None]:
def check_elements_in_col(x):
        if os.environ["CORPUS"] == "public_bi":
            current_col = load_tablecolumn_public_bi(x["dataset_id"])
        else:
            current_col = load_tablecolumn(x["dataset_id"])
        min_num = PERCENTAGE_OF_ELEMENTS_IN_COL * len(current_col)
        if min_num <= 1: min_num = 2
        num_of_elements_in_col = len([
            x for x in current_col
            if str(x).lower() in predefined_elements
        ])
        if num_of_elements_in_col >= min_num:
            LABEL = label_enc.transform([sem_type])[0]
            return LABEL
        return -1

In [None]:
import pandas as pd

df  = pd.read_csv("../data/public_bi/benchmark/MLB/MLB_2.csv", sep="|", header=None, usecols=[1])
len(df)

In [None]:
df[1]

In [None]:
%run regex_elements_in_col/run_regex_elements_in_col.py -c regex_elements_in_col/params/public_bi/description.txt --labeled_data_size 1 --absolute_number True --corpus public_bi --gen_train_data True --n_worker 1 --random_state 1

In [None]:
%run regex_elements_in_col/run_regex_elements_in_col.py -c regex_elements_in_col/params/public_bi/description.txt --labeled_data_size 1 --regexes {"^\W*(\w+\b\W*){6,}$"} --absolute_number True --corpus public_bi --gen_train_data True --n_worker 1 --random_state 1

### test combine LFs to solve huge bug

In [37]:
import pandas as pd
pd.set_option('display.max_rows', 2000)
corpus = "public_bi_num"
labeled_data_size = 1
random_state = 2

#df_CH = pd.read_csv("header_to_sem_type_sim/out/results/public_bi_header_to_sem_type_results_0.9_1_absolute_20.0_1.csv")
df_EmbClus = pd.read_csv(
    f"../emb_clus/without_knn/out/{corpus}_clustering_n_classify_results_gen_train_data_0.01_{labeled_data_size}_absolute_20.0_{random_state}_strings_GoogleUSEv3.csv")
df_EmbClus = df_EmbClus[df_EmbClus["already_labeled"] == False]

df_emd = pd.read_csv(
    f"../labeling_functions/numerics/normal_EMD/out/gen_train_data/{corpus}_gen_training_data_0.01_None_{labeled_data_size}_absolute_20.0_{random_state}.csv")

print(len(df_EmbClus))
print(len(df_emd))

# we should combine all dataset-id from all lf and use this as now index list for the differenet dataframes
print(len(df_EmbClus["dataset_id"].tolist()+df_emd["dataset_id"].tolist()))
len(list(set(df_EmbClus["dataset_id"].tolist()+df_emd["dataset_id"].tolist())))
new_index_order = list(
    set(df_EmbClus["dataset_id"].tolist()+df_emd["dataset_id"].tolist()))


1026
73
1099


In [38]:
df_EmbClus.head(3)

Unnamed: 0,table,column,dataset_id,already_labeled,semanticType,cluster_label,predicted_type
32,CityMaxCapita_1,column_1,CityMaxCapita_1+column_1,False,state,135,
33,CityMaxCapita_1,column_2,CityMaxCapita_1+column_2,False,city,121,
34,CityMaxCapita_1,column_10,CityMaxCapita_1+column_10,False,gender,33,


In [39]:
df_emd.head(3)

Unnamed: 0,table,column,dataset_id,predicted_semantic_type
0,MLB_43,column_41,MLB_43+column_41,X1B
1,MLB_59,column_37,MLB_59+column_37,SB
2,MLB_37,column_37,MLB_37+column_37,SO


In [40]:
df_emd = df_emd.set_index("dataset_id")
df_emd = df_emd.reindex(new_index_order)
df_emd.head(10)


Unnamed: 0_level_0,table,column,predicted_semantic_type
dataset_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
MedPayment2_1+column_7,,,
MLB_33+column_52,,,
CommonGovernment_10+column_35,,,
Provider_7+column_15,,,
RealEstate2_3+column_20,,,
MLB_8+column_29,,,
Physicians_1+column_15,,,
Taxpayer_3+column_15,,,
MLB_7+column_25,MLB_7,column_25,X2B
MLB_45+column_58,,,


In [43]:
df_emd["predicted_semantic_type"] = df_emd["predicted_semantic_type"].fillna("None")
df_emd.head(10)

Unnamed: 0_level_0,table,column,predicted_semantic_type
dataset_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
MedPayment2_1+column_7,,,
MLB_33+column_52,,,
CommonGovernment_10+column_35,,,
Provider_7+column_15,,,
RealEstate2_3+column_20,,,
MLB_8+column_29,,,
Physicians_1+column_15,,,
Taxpayer_3+column_15,,,
MLB_7+column_25,MLB_7,column_25,X2B
MLB_45+column_58,,,


In [26]:
df_EmbClus = df_EmbClus.set_index("dataset_id")
df_EmbClus = df_EmbClus.reindex(new_index_order)
df_EmbClus.head(3)

Unnamed: 0_level_0,table,column,already_labeled,semanticType,cluster_label,predicted_type
dataset_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
MedPayment2_1+column_7,MedPayment2_1,column_7,False,description,57.0,description
MLB_33+column_52,MLB_33,column_52,False,teamName,177.0,
CommonGovernment_10+column_35,CommonGovernment_10,column_35,False,name,70.0,


In [18]:
df_EmbClus.head(10)

Unnamed: 0_level_0,table,column,already_labeled,semanticType,cluster_label,predicted_type
dataset_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
MedPayment2_1+column_7,MedPayment2_1,column_7,False,description,57.0,description
MLB_33+column_52,MLB_33,column_52,False,teamName,177.0,
CommonGovernment_10+column_35,CommonGovernment_10,column_35,False,name,70.0,
Provider_7+column_15,Provider_7,column_15,False,name,36.0,
RealEstate2_3+column_20,RealEstate2_3,column_20,False,city,112.0,
MLB_8+column_29,MLB_8,column_29,False,team,2.0,team
Physicians_1+column_15,Physicians_1,column_15,False,name,36.0,
Taxpayer_3+column_15,Taxpayer_3,column_15,False,name,36.0,
MLB_7+column_25,,,,,,
MLB_45+column_58,MLB_45,column_58,False,teamName,153.0,


In [44]:
df_emd.head(1)

Unnamed: 0_level_0,table,column,predicted_semantic_type
dataset_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
MedPayment2_1+column_7,,,


In [49]:
import os
import sys
from os.path import join
import pandas as pd
import json
from sklearn.preprocessing import LabelEncoder
import numpy as np
import configargparse
from dotenv import load_dotenv

load_dotenv(override=True)

valid_headers_path = join(os.environ["WORKING_DIR"], "data", "extract", "out", "valid_headers")


with open(join(valid_headers_path, f"{corpus}_{os.environ['TYPENAME']}.json")) as f:
        valid_headers = json.load(f)

In [50]:
valid_headers

{'CityMaxCapita_1': {'column_1': {'semanticType': 'state'},
  'column_2': {'semanticType': 'city'},
  'column_10': {'semanticType': 'gender'},
  'column_14': {'semanticType': 'language'},
  'column_20': {'semanticType': 'region'},
  'column_22': {'semanticType': 'country'},
  'column_23': {'semanticType': 'state'},
  'column_29': {'semanticType': 'name'},
  'column_30': {'semanticType': 'name'},
  'column_15': {'semanticType': 'latitude'},
  'column_18': {'semanticType': 'longitude'}},
 'CMSprovider_1': {'column_5': {'semanticType': 'code'},
  'column_6': {'semanticType': 'description'},
  'column_12': {'semanticType': 'code'},
  'column_13': {'semanticType': 'city'},
  'column_14': {'semanticType': 'country'},
  'column_15': {'semanticType': 'gender'},
  'column_16': {'semanticType': 'state'},
  'column_17': {'semanticType': 'address'},
  'column_18': {'semanticType': 'address'},
  'column_21': {'semanticType': 'service'},
  'column_22': {'semanticType': 'type'}},
 'CMSprovider_2': {'

# Test Transformer Model with Hugging Face API

In [None]:
from transformers import pipeline

classifier = pipeline("sentiment-analysis")
classifier("I love the way youre working")

In [None]:
classifier.model.config

In [None]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

tokenizer = AutoTokenizer.from_pretrained("deepset/gelectra-base-germanquad")

model = AutoModelForQuestionAnswering.from_pretrained("deepset/gelectra-base-germanquad")

In [None]:
electra = pipeline(task="question-answering", model=model, tokenizer=tokenizer)

question = r'''Wer hat Sommerhaus später geschrieben?'''
context = r'''Peter Stamm hat Agnes geschrieben. Judith Hermann ist die Autorin von Lettipark und Sommerhaus später. Thomas Bernhard hat das Kalkwerk geschrieben.'''

output = electra(context=context, question=question)
output

In [None]:
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("bert-base-german-cased")

In [None]:
text = 'Die Buchhandlung verkauft Kinderbücher'
tokens = tokenizer.tokenize(text)
tokens

In [None]:
vocab = tokenizer.vocab
len(vocab), vocab['##bücher']

In [None]:
ids = tokenizer.convert_tokens_to_ids(tokens)
ids

In [None]:
out_tokenizer = tokenizer(text)
out_tokenizer

In [None]:
tokenizer.decode(out_tokenizer['input_ids'])

In [None]:
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("bert-base-german-cased")
model = AutoModel.from_pretrained("bert-base-german-cased")


In [None]:
text = "Ich wiege 110 kg."
inputs = tokenizer(text, return_tensors="pt")
output = model(**inputs)
output[0].shape

In [None]:
tokenizer.tokenize(text)

In [None]:
output[0]

## fine tune pre-trained networks from transformer bib

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

In [None]:
for idx, layer in enumerate(model.bert.encoder.layer):
    for params in layer.parameters():
        params.requires_grad = False
        #print(idx, params.requires_grad)

In [None]:
params = [{
    "params": [p for n, p in model.named_parameters() if p.requires_grad]
}]


In [None]:
text = "It's impossible to watch this movie for longer than 5 minutes"
inputs = tokenizer(text, return_tensors="pt")
outputs = model(**inputs)
outputs.logits.argmax(1).cpu().detach().numpy().tolist()

In [None]:
import torch.nn as nn

predictions = nn.Softmax(dim=1)
predictions = predictions(outputs["logits"])
predictions

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv("IMDB_reviews/IMDB Dataset.csv")

X = df["review"].values
y = df["sentiment"].map(lambda x: 1 if x=='positive' else 0).values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=11)

X_train.shape, y_train.shape, X_test.shape, y_test.shape

In [None]:
X_train_tok = dict(tokenizer(X_train.tolist(), padding="max_length", truncation=True, max_length=50, return_tensors="pt"))
X_test_tok = dict(tokenizer(X_test.tolist(), padding="max_length", truncation=True, max_length=50, return_tensors="pt"))

X_train_tok["y"] = y_train
X_test_tok["y"] = y_test
print(X_train_tok["input_ids"].shape, X_test_tok["input_ids"].shape)
type(X_train_tok["input_ids"])


In [188]:
from torch.optim import Adam 
from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader, Dataset
s
optimizer = Adam(params, lr=5e-5)
loss_fn = CrossEntropyLoss()

train_dataloader = DataLoader(X_train_tok["input_ids"], batch_size=16)
valid_dataloader = DataLoader(X_test_tok, batch_size=16)

for epoch in range(1):

    for features in train_dataloader:
        #if i_batch == 0:
        print(features)

    # model.train()

    # tr_loss = 0.
    # tr_pred_list = []
    # tr_true_list = []

    # vl_loss = 0.
    # vl_pred_list = []
    # vl_true_list = []

    # logits = model(**X_train_tok).logits
    # print(logits)
    # loss = loss_fn(logits, y_train)
    # print(loss)
