# Design Thinking for Document Data Exploration from the Prototype code
**Group 6 Members**
* 110C51505 張仁程
* 110C51537 王星文
* 110C51522 謝安
* 110C51526 蔡侑芯
* 110C51533 陳毓庭
* 110C51520 楊文均

In [None]:
# Run this cell when working on Windows or Linux.
# Install gdown first (pip install gdown).
import gdown

# Google Drive direct-download link of the archive to fetch.
url = 'https://drive.google.com/uc?export=download&id=10G2D9fm3UL92W2UuKdrw4bzJkDu0OflJ'
# Local file the download is written to.
output = '/Code/NTUTDS/patent_documents/patentdoc.tar.gz'
# Replace the path above with your own.
gdown.download(url, output, quiet=False)

In [21]:
# Run this cell when working on Windows or Linux.
import os
import tarfile

def tar_extract(file_path, dest_dir='.'):
    """Extract a gzip-compressed tar archive.

    Args:
        file_path: Path of the ``.tar.gz`` archive to unpack.
        dest_dir: Directory to extract into. Defaults to the current working
            directory, which is where the archive is unpacked when the
            argument is omitted.

    Raises:
        OSError: If the archive file cannot be opened.
        tarfile.TarError: If the archive is not a valid gzip tar file, or a
            member is rejected by the extraction filter.
    """
    # The context manager closes the archive even when extraction fails.
    with tarfile.open(file_path, 'r:gz') as tar:
        if hasattr(tarfile, 'data_filter'):
            # The archive comes from a network download: where the running
            # Python supports extraction filters, refuse members that would
            # be written outside dest_dir.
            tar.extractall(dest_dir, filter='data')
        else:
            tar.extractall(dest_dir)

if __name__ == '__main__':
    # Archive written by the download step; unpacked by tar_extract.
    file_path = '/Code/NTUTDS/patent_documents/patentdoc.tar.gz'
    tar_extract(file_path)
# Replace the path above with your own.

In [None]:
# Instructor's original code; run this cell when working on Colab.
#!wget "https://drive.google.com/uc?export=download&id=10G2D9fm3UL92W2UuKdrw4bzJkDu0OflJ" -O patentdoc.tar.gz
#!tar xvfz patentdoc.tar.gz

In [None]:
# Data extraction and processing: parse every patent file and merge the
# per-document records into one pandas DataFrame.
import os

import pandas as pd
from bs4 import BeautifulSoup


def _child_text(parent, tag):
    """Return the text of the first *tag* found under *parent*, or ''.

    An empty string is returned when *parent* is None or has no such
    descendant, so one incomplete document does not abort the whole run.
    """
    node = parent.find(tag) if parent is not None else None
    return node.get_text() if node is not None else ''


def _person_name(parent):
    """Return "first last" built from the first <first-name> and
    <last-name> elements found under *parent* ('' when both are missing)."""
    parts = (_child_text(parent, "first-name").strip(),
             _child_text(parent, "last-name").strip())
    return ' '.join(part for part in parts if part)


# Column order of the resulting DataFrame.
PATENT_COLUMNS = ["Patent Number", "Publication Date", "Application Date",
                  "Country", "Application Type", "Title", "Abstract",
                  "Applicant", "Inventor", "IPCs"]

path = "patent_documents"
patent_csv = {}    # record of the document currently being processed
patent_rows = []   # one record per parsed document
for filename in os.listdir(path):
    if not filename.endswith(".txt"):
        continue
    # Pin UTF-8 so Windows decodes the files the same way Linux/Colab do
    # by default.
    with open(os.path.join(path, filename), 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f.read(), 'xml')

    pub_ref = soup.find("publication-reference")
    patent_number = _child_text(pub_ref, "doc-number")
    pub_date = _child_text(pub_ref, "date")
    country = _child_text(pub_ref, "country")

    app_ref = soup.find("application-reference")  # lookup node, not exported
    app_type = app_ref.get('appl-type', '') if app_ref is not None else ''
    app_date = _child_text(app_ref, "date")

    patent_title = _child_text(soup, "invention-title")
    patent_abstract = _child_text(soup, "abstract")
    # NOTE: the agent <orgname> is not extracted; not every document has one.

    us_applicants = soup.find("us-applicants")  # lookup node, not exported
    applicant_name = _person_name(us_applicants)
    inventors = soup.find("inventors")  # lookup node, not exported
    inv_name = _person_name(inventors)

    # Only the first <classification-ipcr> entry of the document is read.
    ipcr_ref = soup.find("classification-ipcr")  # lookup node, not exported
    ipcr_section = _child_text(ipcr_ref, "section")
    ipcr_class = _child_text(ipcr_ref, "class")
    if len(ipcr_class) == 1:
        # Zero-pad a one-digit class so the code reads e.g. "G06F".
        ipcr_class = "0" + ipcr_class
    ipcr_subclass = _child_text(ipcr_ref, "subclass")
    ipc_list = ipcr_section + ipcr_class + ipcr_subclass

    patent_csv = {
        "Patent Number": patent_number,
        "Publication Date": pub_date,
        "Application Date": app_date,
        "Country": country,
        "Application Type": app_type,
        "Title": patent_title.replace('\n', ' '),
        "Abstract": patent_abstract.replace('\n', ' '),
        "Applicant": applicant_name,
        "Inventor": inv_name,
        "IPCs": ipc_list,
    }
    patent_rows.append(patent_csv)

# One row per document; an empty frame with the same columns when no file
# was found.
dfs = pd.DataFrame(patent_rows, columns=PATENT_COLUMNS)
  




In [None]:
# Data extraction and processing, variant 2: merge the records in a dict.
import os

import pandas as pd
from bs4 import BeautifulSoup

path = "patent_documents"
# Column-oriented result: each key maps to a list with one entry per parsed
# document, so every document is kept (not just the last one read).
patent_csv = {"Patent Number": [], "Publication Date": []}
for filename in os.listdir(path):
    if not filename.endswith(".txt"):
        continue
    # Pin UTF-8 so Windows decodes the files the same way Linux/Colab do
    # by default.
    with open(os.path.join(path, filename), 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f.read(), 'xml')
    pub_ref = soup.find("publication-reference")
    patent_number = pub_ref.find("doc-number").get_text()
    pub_date = pub_ref.find("date").get_text()
    patent_csv["Patent Number"].append(patent_number)
    patent_csv["Publication Date"].append(pub_date)
      


In [None]:
# Collect the claims text of every patent file, keyed by file name.
import os
import pprint

from bs4 import BeautifulSoup

path = "patent_documents"
patent_docs_txt = {}
for filename in os.listdir(path):
    if not filename.endswith(".txt"):
        continue
    # Pin UTF-8 so Windows decodes the files the same way Linux/Colab do
    # by default.
    with open(os.path.join(path, filename), 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f.read(), 'xml')
    claims = soup.find("claims")
    if claims is None:
        # A document without a <claims> element is reported and skipped
        # instead of stopping the loop with an IndexError.
        print(filename + ": no <claims> element found, skipped")
        continue
    # Lower-cased, single-line text of the first <claims> element.
    patent_docs_txt[filename] = claims.text.lower().replace('\n', ' ')

pprint.pprint(patent_docs_txt, depth=1, width=10000)

In [None]:
import nltk
import re

print(nltk.__version__)
# Resources needed by nltk.word_tokenize and the stop-word list.
# Newer NLTK releases load the tokenizer from 'punkt_tab' while older ones
# use 'punkt'; fetching both keeps the notebook working on either.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk_stopwords = nltk.corpus.stopwords.words('english')

def Patent_KW50(patent_text):
    """Return up to 50 of the most frequent keywords in *patent_text*.

    A token counts as a keyword when it is longer than one character,
    consists only of letters a-z (case-insensitive match), and is not among
    the first 100 entries of the module-level ``nltk_stopwords`` list.
    Filtering happens before the top 50 are selected, so discarded tokens
    (punctuation, numbers, stop words) do not use up any of the 50 slots.

    Args:
        patent_text: Text to analyse; tokenised with ``nltk.word_tokenize``.

    Returns:
        dict mapping keyword -> occurrence count, ordered from most to
        least frequent.
    """
    tokens = nltk.word_tokenize(patent_text)
    # A set gives O(1) membership tests; the 100-entry cut-off is kept as-is.
    stopword = set(list(nltk_stopwords)[:100])
    # Every distinct token with its count, most frequent first.
    ranked = nltk.FreqDist(tokens).most_common()
    keywords = [
        (k, v) for k, v in ranked
        if not re.fullmatch(r"^.$", k)
        and re.fullmatch(r"^[a-z]+$", k, flags=re.IGNORECASE)
        and k not in stopword
    ]
    return dict(keywords[:50])

# Keyword table: the Patent_KW50 result of every document, keyed by the
# same name used in patent_docs_txt.
patent_docs_KW = dict()
for doc_name in patent_docs_txt:
    txt = patent_docs_txt[doc_name]
    patent_docs_KW[doc_name] = Patent_KW50(txt)

pprint.pprint(patent_docs_KW, width=1000)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from itertools import islice

plt.clf()

# One bar chart per document showing its (up to) 20 leading keywords.
for doc_name, KW_stat in patent_docs_KW.items():
    # First 20 entries of the keyword dict, in the order it stores them.
    common20 = dict(islice(KW_stat.items(), 20))
    print(common20)
    # Size the axis from the actual number of keywords: a document may have
    # fewer than 20, and tick positions, labels and bar heights must match.
    x = np.arange(len(common20))
    y = list(common20.values())
    plt.title(doc_name, fontsize=8, color='g')
    plt.xticks(x, list(common20.keys()), rotation='vertical')
    plt.bar(x, y, color='y')
    plt.show()