# DBpedia Dataset 

Sourced from: https://huggingface.co/datasets/dbpedia_14

### Install Dependencies

### Imports

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import tensorflow_datasets as tfds

pd.set_option('display.max_colwidth', 0)

### Download Dataset

In [3]:
data_dir = "./"
# data_dir = "/content/drive/MyDrive/Research/DBpedia/"
ds, info = tfds.load('huggingface:dbpedia_14/dbpedia_14', split='train', with_info=True, data_dir=data_dir)
testds = tfds.load('huggingface:dbpedia_14/dbpedia_14', split='test', data_dir=data_dir)

FileNotFoundError: ignored

### Dataset Info

In [None]:
info

tfds.core.DatasetInfo(
    name='d_bpedia14',
    full_name='d_bpedia14/dbpedia_14/2.0.0',
    description="""
    The DBpedia ontology classification dataset is constructed by picking 14 non-overlapping classes
    from DBpedia 2014. They are listed in classes.txt. From each of thse 14 ontology classes, we
    randomly choose 40,000 training samples and 5,000 testing samples. Therefore, the total size
    of the training dataset is 560,000 and testing dataset 70,000.
    There are 3 columns in the dataset (same for train and test splits), corresponding to class index
    (1 to 14), title and content. The title and content are escaped using double quotes ("), and any
    internal double quote is escaped by 2 double quotes (""). There are no new lines in title or content.
    """,
    config_description="""
    DBpedia 2014 Ontology Classification Dataset.
    """,
    homepage='https://wiki.dbpedia.org/develop/datasets',
    data_path='/content/drive/MyDrive/Research/DBpedia/d_bpedia14

In [None]:
info.features

FeaturesDict({
    'content': tf.string,
    'label': ClassLabel(shape=(), dtype=tf.int64, num_classes=14),
    'title': tf.string,
})

### Dataset Size

In [None]:
print("Train Dataset Size: ",info.splits['train'].num_examples)
print("Test Dataset Size: ",info.splits['test'].num_examples)

Train Dataset Size:  560000
Test Dataset Size:  70000


### Classes

In [None]:
print("Total Classes: ",info.features["label"].num_classes,"\n")
classes = info.features["label"].names
class_labels = [*range(14)]
for lbl,cls in zip(class_labels,classes): print(lbl,cls)

Total Classes:  14 

0 Company
1 EducationalInstitution
2 Artist
3 Athlete
4 OfficeHolder
5 MeanOfTransportation
6 Building
7 NaturalPlace
8 Village
9 Animal
10 Plant
11 Album
12 Film
13 WrittenWork


### Class Distribution

In [None]:
vals = np.unique(np.fromiter(ds.map(lambda x: x["label"]), float), return_counts=True)
print("Class distribution of Training Split")
for val, count in zip(*vals):
    print(int(val), count)

0 40000
1 40000
2 40000
3 40000
4 40000
5 40000
6 40000
7 40000
8 40000
9 40000
10 40000
11 40000
12 40000
13 40000


In [None]:
vals = np.unique(np.fromiter(testds.map(lambda x: x["label"]), float), return_counts=True)
print("Class distribution of Testing Split")
for val, count in zip(*vals):
    print(int(val), count)

Class distribution of Testing Split
0 5000
1 5000
2 5000
3 5000
4 5000
5 5000
6 5000
7 5000
8 5000
9 5000
10 5000
11 5000
12 5000
13 5000


### First 5 samples

In [None]:
tfds.as_dataframe(ds.take(5), info)

Unnamed: 0,content,label,title
0,b' Purkinje Incorporated pioneered pen computing for comprehensive clinical management of patients by physicians in 1991 with the PureMD ontology-anchored medical record was later rename Dossier that allowed physician order entry knowledge-based clinical decision support and billing from clinical note taking (e.g. medical history) recorded on a tablet computer.',0 (Company),b'Purkinje Incorporated'
1,b' The Alfa Romeo 33 (Type 905 and 907) is a small family car produced by the Italian automaker Alfa Romeo between 1983 and 1995. It was essentially an evolution of its predecessor the Alfasud which was based on the same floorplan chassis and mechanicals albeit with some minor modifications.',5 (MeanOfTransportation),b'Alfa Romeo 33'
2,b' \xc2\xa1Tr\xc3\xa9! is the eleventh studio album by the American punk rock band Green Day. It is the third and final installment in the \xc2\xa1Uno! \xc2\xa1Dos! \xc2\xa1Tr\xc3\xa9! trilogy a series of studio albums that were released from September to December 2012.',11 (Album),b'\xc2\xa1Tr\xc3\xa9!'
3,b' Paw\xc5\x82owice Gorzowskie [pavw\xc9\x94\xcb\x88vit\xcd\xa1s\xc9\x9b \xc9\xa1\xc9\x94\xcb\x88\xca\x90\xc9\x94fsk\xca\xb2\xc9\x9b] is a village in the administrative district of Gmina Gorz\xc3\xb3w \xc5\x9al\xc4\x85ski within Olesno County Opole Voivodeship in south-western Poland. It lies approximately 5 kilometres (3 mi) south-west of Gorz\xc3\xb3w \xc5\x9al\xc4\x85ski 15 km (9 mi) north of Olesno and 50 km (31 mi) north-east of the regional capital Opole.The village has a population of 480.',8 (Village),b'Paw\xc5\x82owice Gorzowskie'
4,b' Young and Beautiful (1934) is a romantic comedy film about a press agent who goes to great lengths to make his actress girlfriend a star only to risk losing her in the process. It stars William Haines and Judith Allen.',12 (Film),b'Young and Beautiful (film)'


### Get nth sample from each class

In [None]:
# df = tfds.as_dataframe(ds)
# df.to_csv(data_dir+"/DBpedia_train.csv")
# testdf = tfds.as_dataframe(testds)
# testdf.to_csv(data_dir+"/DBpedia_test.csv")

In [None]:
df = tfds.as_dataframe(ds.take(100))
df.to_csv(data_dir+"/DBpedia_subset.csv")

In [None]:
df = pd.read_csv(data_dir+"/DBpedia_subset.csv", header = 0)

In [None]:
df=df.groupby('label').nth(0).rename(columns={"Unnamed: 0":"Index"})
df["Class"] = classes
df.style.set_properties(**{'text-align': 'left'})

Unnamed: 0_level_0,Index,content,title,Class
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,9,b' Cartridge World is an Ink and toner specialty retailer and franchisor. Cartridge World stores provide printing products (cartridges printers paper etc.) advice and service to home and business customers. Cartridge World stores also offer a cost-saving alternative to buying new OEM printer cartridges.The company is currently providing millions of replacement printer cartridges annually using state-of-the-art processes and high-performance ink and toner.',b'Cartridge World',Company
1,37,"b"" Islamic Azad University - Garmsar Branch is a branch of Iran's Islamic Azad Universities. It is situated in the city of Garmsar 82 kilometers southeast of Tehran.""",b'Islamic Azad University of Garmsar',EducationalInstitution
2,50,b' Albert Gilles (1895-1979) was a French coppersmith.',b'Albert Gilles',Artist
3,57,b' Mohammad Ashraful (Bengali: \xe0\xa6\xae\xe0\xa7\x8b\xe0\xa6\xb9\xe0\xa6\xbe\xe0\xa6\xae\xe0\xa7\x8d\xe0\xa6\xae\xe0\xa6\xa6 \xe0\xa6\x86\xe0\xa6\xb6\xe0\xa6\xb0\xe0\xa6\xbe\xe0\xa6\xab\xe0\xa7\x81\xe0\xa6\xb2) (born 7 July 1984) is a former Bangladeshi international cricket player who has represented the Bangladesh national cricket team. A top-order batsman with a penchant for flashy strokeplay he has also been selected to represent ACC Asia XI ODI side. Between 2007 and 2009 Ashraful captained his country in 13 Tests losing all but one which ended in a draw and 38 One Day Internationals (ODIs) of which Bangladesh won eight.',b'Mohammad Ashraful',Athlete
4,7,b' Shelley Mayer is a Democratic member of the New York State Assembly representing Assembly District 90 which includes the city of Yonkers. She was first elected on March 20 2012 in a special election to succeed Mike Spano and was re-elected in November 2012. Mayer was born and raised in Yonkers. Prior to her election to the Assembly Mayer was a Senior Counsel at the National State Attorney General Program at Columbia University where she focused on health care and labor law rights.',b'Shelley Mayer',OfficeHolder
5,11,b' The Wassmer WA-40 Super 4 Sancy is a French single-engined light aircraft of the 1960s and 70s. A single-engined low-winged monoplane with retractable nosewheel undercarriage variants include the more powerful WA 4/21 Prestige and the WA-41 Baladou with a fixed undercarriage.',b'Wassmer WA-40',MeanOfTransportation
6,19,b' Oulu City Hall (Finnish: Oulun kaupungintalo) is the seat for the municipal government of the City of Oulu Finland. It is located in the Pokkinen district of the central Oulu.The neo-renaissance style city hall was designed by a Swedish architect Johan Erik Stenberg as the restaurant and hotel Seurahuone in 1885. The third floor was added and other major changes made in 1920 according to plans of architect Oiva Kallio.',b'Oulu City Hall',Building
7,34,b' The Rivi\xc3\xa8re S\xc3\xa8che is a river of Martinique.',b'Rivi\xc3\xa8re S\xc3\xa8che',NaturalPlace
8,8,b' Karamjavan (Persian: \xd9\x83\xd8\xb1\xd9\x85\xd8\xac\xd9\x88\xd8\xa7\xd9\x86\xe2\x80\x8e also Romanized as Karamjav\xc4\x81n) is a village in Sarajuy-ye Sharqi Rural District Saraju District Maragheh County East Azerbaijan Province Iran. At the 2006 census its population was 2262 in 506 families.',b'Karamjavan',Village
9,39,b' Belemnia lydia is a moth of the Arctiidae family. It was described by Druce in 1896. It is found in Colombia.',b'Belemnia lydia',Animal
