In [1]:
import textract
import re
import pandas as pd

# Extract text from pdf using textract

In [2]:
text = textract.process("Q2.pdf").decode("utf-8")
text

'Service Diagnosis\r\n\r\nA1\r\nDescription\r\n\r\nINDOOR PCB ABNORMALITY\r\n\r\nPossible Root\r\ncause\r\n\r\n1. Faulty indoor PCB.\r\n2. Faulty connector connection at indoor.\r\n\r\nTroubleshooting\r\nTurn off unit.\r\n\r\nCheck indoor PCB connector\r\nconditions (including PCB to\r\nterminal block and all PCB wire\r\nconnector).\r\n\r\nAny sign\r\nof loose or\r\nabnormal.\r\nNo\r\nReplace indoor PCB and\r\noperate again.\r\n\r\n35\r\n\r\nYes\r\n\r\nConnect correctly and\r\noperate again.\r\n\r\n\x0cService Diagnosis\r\n\r\nA5\r\nDescription\r\n\r\nANTIFREEZE PROTECTION OR HIGH PRESSURE CONTROL\r\n\r\nPossible Root\r\ncause\r\n\r\n1.\r\n2.\r\n3.\r\n4.\r\n5.\r\n\r\nIndoor air short circuit.\r\nIndoor coil thermistor faulty.\r\nIndoor PCB faulty.\r\nFan blower dirty.\r\n\r\nTroubleshooting\r\nCheck indoor air flow.\r\n\r\nAny air short\r\ncircuit?\r\n\r\nYes\r\n\r\nProvide sufficient air passage.\r\n\r\nNo\r\nCheck intake air filter.\r\n\r\nIs it very dirty?\r\n\r\nYes\r\n\r\nClean th

# Some adjustment

In [3]:
text = '\x0c' + text
text = re.sub(r'\r', '', text)
text = re.sub(r'\x0c', '\n>>>\n', text) # indicate a different page
text = re.sub(r':', '\n', text)
lines = text.splitlines()
while("" in lines) : 
	lines.remove("") 

In [4]:
for line in lines:
    print(line)

>>>
Service Diagnosis
A1
Description
INDOOR PCB ABNORMALITY
Possible Root
cause
1. Faulty indoor PCB.
2. Faulty connector connection at indoor.
Troubleshooting
Turn off unit.
Check indoor PCB connector
conditions (including PCB to
terminal block and all PCB wire
connector).
Any sign
of loose or
abnormal.
No
Replace indoor PCB and
operate again.
35
Yes
Connect correctly and
operate again.
>>>
Service Diagnosis
A5
Description
ANTIFREEZE PROTECTION OR HIGH PRESSURE CONTROL
Possible Root
cause
1.
2.
3.
4.
5.
Indoor air short circuit.
Indoor coil thermistor faulty.
Indoor PCB faulty.
Fan blower dirty.
Troubleshooting
Check indoor air flow.
Any air short
circuit?
Yes
Provide sufficient air passage.
No
Check intake air filter.
Is it very dirty?
Yes
Clean the air filter.
No
Check the dust
accumulate indoor coil.
Is it very dirty?
Yes
Clean the indoor coil.
No
Check fan blower condition.
Is it very dirty?
Yes
Clean fan blower.
No
Check indoor coil
thermistor resistance.
Does it conform
to the t

# Add the page number to the first item of each page

In [5]:
idx_page = 0
page = 0
for idx, line in enumerate(lines):
    if line==">>>":
        try:
            for i in range(idx, 0, -1):
                if lines[i].isnumeric():
                    print(page, '. ', lines[i], sep='')
                    lines[idx_page]+=lines[i]
                    idx_page = idx
                    page+=1
                    break
        except:
            pass

0. 35
1. 36
2. 37
3. 38
4. 39
5. 40
6. 41
7. 42
8. 43
9. 44
10. 45
11. 46
12. 47
13. 48
14. 49
15. 50
16. 51
17. 52
18. 53
19. 54
20. 55
21. 56
22. 57
23. 58
24. 59
25. 60


# Get the datas

In [6]:
page_now = 0 
datas = []
searchCause = False
data = {"Description":'', "Possible_Root_cause":[], "Page_number":''}
for idx, line in enumerate(lines):
    if ">>>" in line:
        page_now = line[3:]
    elif line == "Description":
        data["Description"] = lines[idx+1]
    elif line == "Possible Root":
        searchCause=True
    elif line == "Troubleshooting":
        searchCause = False
        data["Page_number"] = page_now
        datas.append(data)
        data = {"Description":'', "Possible_Root_cause":[], "Page_number":''}
    elif searchCause and line != 'cause':
        data["Possible_Root_cause"].append(line)
datas

[{'Description': 'INDOOR PCB ABNORMALITY',
  'Possible_Root_cause': ['1. Faulty indoor PCB.',
   '2. Faulty connector connection at indoor.'],
  'Page_number': '35'},
 {'Description': 'ANTIFREEZE PROTECTION OR HIGH PRESSURE CONTROL',
  'Possible_Root_cause': ['1.',
   '2.',
   '3.',
   '4.',
   '5.',
   'Indoor air short circuit.',
   'Indoor coil thermistor faulty.',
   'Indoor PCB faulty.',
   'Fan blower dirty.'],
  'Page_number': '36'},
 {'Description': 'INDOOR FAN MOTOR ABNORMALITY',
  'Possible_Root_cause': ['1. Indoor fan motor winding short, or the motor lead wire broken.',
   '2. Indoor PCB faulty.'],
  'Page_number': '37'},
 {'Description': 'INDOOR HEAT EXCHANGER THERMISTOR ABNORMALITY',
  'Possible_Root_cause': ['1. Thermistor, connector faulty.',
   '2. Indoor PCB faulty.'],
  'Page_number': '37'},
 {'Description': 'INDOOR ROOM THERMISTOR ABNORMALITY',
  'Possible_Root_cause': ['1. Thermistor, connector faulty.',
   '2. Indoor PCB faulty.'],
  'Page_number': '38'},
 {'Descr

# Filtering unwanted text and tidy up the output

In [7]:
for data in datas:
    new = []
    for i, each in enumerate(data["Possible_Root_cause"]):
        stripped = re.sub("([0-9]+.)", "", each).strip()
        if len(stripped)>0:
            try:
                if (stripped[0].isupper() or stripped[0].isdigit()) and new[-1][-1]=='.':
                    new.append(stripped)
                else:
                    print('>>>', data["Page_number"])
                    new[-1] += ' ' + stripped
            except:
                new.append(stripped)
    data["Possible_Root_cause"] = new
datas

>>> 39
>>> 40
>>> 40
>>> 41
>>> 45
>>> 53
>>> 57
>>> 58
>>> 59


[{'Description': 'INDOOR PCB ABNORMALITY',
  'Possible_Root_cause': ['Faulty indoor PCB.',
   'Faulty connector connection at indoor.'],
  'Page_number': '35'},
 {'Description': 'ANTIFREEZE PROTECTION OR HIGH PRESSURE CONTROL',
  'Possible_Root_cause': ['Indoor air short circuit.',
   'Indoor coil thermistor faulty.',
   'Indoor PCB faulty.',
   'Fan blower dirty.'],
  'Page_number': '36'},
 {'Description': 'INDOOR FAN MOTOR ABNORMALITY',
  'Possible_Root_cause': ['Indoor fan motor winding short, or the motor lead wire broken.',
   'Indoor PCB faulty.'],
  'Page_number': '37'},
 {'Description': 'INDOOR HEAT EXCHANGER THERMISTOR ABNORMALITY',
  'Possible_Root_cause': ['Thermistor, connector faulty.',
   'Indoor PCB faulty.'],
  'Page_number': '37'},
 {'Description': 'INDOOR ROOM THERMISTOR ABNORMALITY',
  'Possible_Root_cause': ['Thermistor, connector faulty.',
   'Indoor PCB faulty.'],
  'Page_number': '38'},
 {'Description': 'OUTDOOR PCB ABNORMALITY',
  'Possible_Root_cause': ['Micro 

# Change to dataframe and convert it to csv file

In [11]:
df = pd.DataFrame(datas)
df.to_csv("Q2.csv", index=False)
df

Unnamed: 0,Description,Possible_Root_cause,Page_number
0,INDOOR PCB ABNORMALITY,"[Faulty indoor PCB., Faulty connector connecti...",35
1,ANTIFREEZE PROTECTION OR HIGH PRESSURE CONTROL,"[Indoor air short circuit., Indoor coil thermi...",36
2,INDOOR FAN MOTOR ABNORMALITY,"[Indoor fan motor winding short, or the motor ...",37
3,INDOOR HEAT EXCHANGER THERMISTOR ABNORMALITY,"[Thermistor, connector faulty., Indoor PCB fau...",37
4,INDOOR ROOM THERMISTOR ABNORMALITY,"[Thermistor, connector faulty., Indoor PCB fau...",38
5,OUTDOOR PCB ABNORMALITY,[Micro Controller program run-away due to exte...,39
6,COMPRESSOR OVERLOAD,"[Refrigerant Shortage. way valve malfunction.,...",40
7,COMPRESSOR OVERLOAD,"[Refrigerant Shortage. way valve malfunction.,...",41
8,COMPRESSOR LOCK/START-UP ABNORMALITY,"[Compressor locked., Compressor harness discon...",42
9,OUTDOOR FAN MOTOR LOCK,"[Fan motor breakdown., Harness or connector di...",43


In [12]:
df = pd.read_csv("Q2.csv")
df.head()

Unnamed: 0,Description,Possible_Root_cause,Page_number
0,INDOOR PCB ABNORMALITY,"['Faulty indoor PCB.', 'Faulty connector conne...",35
1,ANTIFREEZE PROTECTION OR HIGH PRESSURE CONTROL,"['Indoor air short circuit.', 'Indoor coil the...",36
2,INDOOR FAN MOTOR ABNORMALITY,"['Indoor fan motor winding short, or the motor...",37
3,INDOOR HEAT EXCHANGER THERMISTOR ABNORMALITY,"['Thermistor, connector faulty.', 'Indoor PCB ...",37
4,INDOOR ROOM THERMISTOR ABNORMALITY,"['Thermistor, connector faulty.', 'Indoor PCB ...",38
