In [None]:
def lines_to_array(file_name: str) -> list:
    """Read a text file and return its lines without the newline characters.

    The file is decoded as UTF-8 first. If that raises ``UnicodeDecodeError``
    the file is read again as cp1251.

    Args:
        file_name: Path of the file to read.

    Returns:
        The lines of the file, in file order, with the line terminator
        removed. Blank lines are kept as empty strings.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is neither valid UTF-8 nor cp1251.
    """
    try:
        # Explicit UTF-8 instead of the platform default, so the result does
        # not depend on the locale of the machine running the notebook.
        with open(file_name, encoding="utf-8") as file:
            lines = file.readlines()
    except UnicodeDecodeError:
        with open(file_name, encoding="cp1251") as file:
            lines = file.readlines()
    return [line.rstrip("\n") for line in lines]


In [None]:
# Module-level pymorphy2 analyzer, created once and reused by filter_normalize.
morph = pymorphy2.MorphAnalyzer()


def filter_normalize(text):
    """Replace every space-separated token of *text* with its normal form.

    Newline, carriage-return and tab characters are turned into spaces, the
    result is split on single spaces, and each token is replaced by the
    ``normal_form`` of the first parse returned by ``morph.parse``.

    Args:
        text: The string to normalize.

    Returns:
        The normalized tokens joined back together with single spaces.
    """
    flattened = re.sub(r"[\n\r\t]", ' ', text)
    lemmas = [morph.parse(token)[0].normal_form for token in flattened.split(" ")]
    return ' '.join(lemmas)

In [352]:
# Raw sentiment lexicons, one entry per line of each file.
positive_words = lines_to_array("./trash/pos.txt")
s_positive_words = lines_to_array("./trash/strong_pos.txt")
negative_words = lines_to_array("./trash/neg.txt")
s_negative_words = lines_to_array("./trash/strong_neg.txt")

# The same lexicons with every entry passed through filter_normalize,
# keyed by sentiment label.
words_by_sentiment = {
    "positive": [filter_normalize(entry) for entry in positive_words],
    "strongly_positive": [filter_normalize(entry) for entry in s_positive_words],
    "negative": [filter_normalize(entry) for entry in negative_words],
    "strongly_negative": [filter_normalize(entry) for entry in s_negative_words],
}

In [None]:
# Raw sentiment lexicons, one entry per line of each file.
positive_words = lines_to_array("./trash/positive.txt")
s_positive_words = lines_to_array("./trash/strongly_positive.txt")
negative_words = lines_to_array("./trash/negative.txt")
s_negative_words = lines_to_array("./trash/strongly_negative.txt")

# The same lexicons with every entry passed through filter_normalize,
# keyed by sentiment label.
words_by_sentiment = {
    "positive": [filter_normalize(entry) for entry in positive_words],
    "strongly_positive": [filter_normalize(entry) for entry in s_positive_words],
    "negative": [filter_normalize(entry) for entry in negative_words],
    "strongly_negative": [filter_normalize(entry) for entry in s_negative_words],
}

# Negated positive entries found inside the negative lexicons.
sentiment_duplicates = get_sentiment_duplicates(words_by_sentiment)

In [None]:
# Load the overall word list and normalize every entry.
recent_words = [
    filter_normalize(entry)
    for entry in lines_to_array("./trash/words_overall.txt")
]
# Notebook display of the normalized list.
recent_words

In [407]:
# load_reports (as defined in this notebook) returns the normalized texts
# first and the cleaned raw texts second; unpack in that order so the
# analysis below runs on the lemmatized text.
normalized_reports, reports_text = load_reports("./reports")

# Per-report results, each keyed by report file name.
sentiment_results = {}
counting_results = {}
words_charact = {}
for rep, text in normalized_reports.items():
    sentiment_results[rep] = sentiment_analysis(text, words_by_sentiment, sentiment_duplicates)
    counting_results[rep] = count_words(text, overall_words)
    text_len, avg_w_len = words_characteristics(text)
    words_charact[rep] = {"Длина в тексте": text_len,
                          "Средняя длина слова": avg_w_len}

In [431]:
def load_reports(directory):
    """Load, clean and normalize every ``.txt`` file directly inside *directory*.

    For each regular file whose name ends in ``.txt`` the content is read,
    every character that is not a Russian letter is replaced by a space, and
    the result is lower-cased. That cleaned text is then passed through
    ``filter_normalize``. Progress is printed to stdout.

    Args:
        directory: Path of the folder holding the report files.

    Returns:
        A tuple ``(normalized_reports, reports_text)`` of two dicts keyed by
        file name: the normalized text first, the cleaned text second.

    Raises:
        OSError: If the directory or one of its files cannot be read.
    """
    normalized_reports = {}
    reports_text = {}
    print(directory)
    for file in os.listdir(directory):
        full_path = f"{directory}/{file}"
        if os.path.isfile(full_path) and full_path.endswith(".txt"):
            # NOTE(review): the file is decoded with the platform default
            # encoding - confirm that matches how the reports were saved.
            with open(full_path) as current_text:
                print(f"Processing of {file} has been started")
                # "ё"/"Ё" sit outside the а-я / А-Я code-point ranges, so they
                # are listed explicitly; otherwise words containing them
                # would be cut in two.
                reports_text[file] = re.sub(r"[^а-яА-ЯёЁ]", ' ', current_text.read()).lower()
                normalized_reports[file] = filter_normalize(reports_text[file])
    return normalized_reports, reports_text

In [406]:
# Filler words and phrases that are excluded from the text statistics.
garbage_words = {"годовой отчет", "год", "также", "быть", "при", "который", "тот", "или"}


def words_characteristics(text):
    """Return the word count and average word length of *text*.

    Every entry of ``garbage_words`` is removed first. Removal is done on
    whole words only, so an entry such as "при" does not eat the beginning of
    a longer word. The remaining text is split on single spaces and tokens of
    two characters or fewer are ignored.

    Args:
        text: The text to measure.

    Returns:
        A tuple ``(word_count, average_length)`` where ``average_length`` is
        the total length of the counted words divided by their number,
        truncated to an int (0 when no word is counted).
    """
    if garbage_words:
        # Longest entries first so a multi-word phrase is preferred over a
        # shorter entry that starts at the same position.
        ordered = sorted(garbage_words, key=len, reverse=True)
        alternatives = "|".join(re.escape(entry) for entry in ordered)
        text = re.sub(fr"\b(?:{alternatives})\b", "", text)
    words = [token for token in text.split(" ") if len(token) > 2]
    text_length = len(words)
    overall_len = sum(len(word) for word in words)
    return text_length, (overall_len // text_length if text_length else 0)

def sentiment_analysis(text, words_by_sentiment, duplicates):
    """Count lexicon hits in *text* for every sentiment label.

    For each label the result is the sum of ``text.count(word)`` (plain
    substring occurrences) over the label's words. Whenever a word is also a
    key of *duplicates*, the value stored there is subtracted.

    Empty lexicon entries are skipped: ``text.count("")`` equals
    ``len(text) + 1`` and would otherwise inflate the score.

    Args:
        text: The text to analyse.
        words_by_sentiment: Mapping of sentiment label to an iterable of words.
        duplicates: Mapping of word to the amount to subtract for that word.

    Returns:
        A dict with one integer score per key of *words_by_sentiment*.
    """
    sent_counts = {}
    for sent, words in words_by_sentiment.items():
        score = 0
        for w in words:
            if not w:
                continue
            score += text.count(w) - duplicates.get(w, 0)
        sent_counts[sent] = score
    return sent_counts

def get_sentiment_duplicates(sentiment_words):
    """Find positive words that occur in negated form in the negative lexicons.

    Every word of the 'positive' and 'strongly_positive' lists is searched
    for, prefixed by "не" with an optional space, in the comma-joined
    'strongly_negative' and 'negative' lists.

    Words are escaped before being put into the pattern, so entries with
    regex metacharacters are matched literally. Empty entries are skipped,
    because they would match every bare "не".

    Args:
        sentiment_words: Mapping with the keys 'positive',
            'strongly_positive', 'negative' and 'strongly_negative', each
            holding a list of words.

    Returns:
        A dict mapping the negated form to its number of occurrences in the
        negative lexicons. The key is "не <word>" when the word consists of
        two space-separated parts whose first part is two characters long,
        and "не<word>" otherwise.
    """
    positive_entries = set(sentiment_words['strongly_positive']) | set(sentiment_words['positive'])
    all_negative = ','.join(sentiment_words['strongly_negative'] + sentiment_words['negative'])
    occured_words = {}
    for w in positive_entries:
        if not w:
            continue
        pattern = "не( )?" + re.escape(w)
        occurences = len(re.findall(pattern, all_negative))
        if occurences == 0:
            continue
        parts = w.split(" ")
        if len(parts) == 2 and len(parts[0]) == 2:
            occured_words[f"не {w}"] = occurences
        else:
            occured_words[f"не{w}"] = occurences
    return occured_words
            
def count_words(text: str, words_to_count: list):
    """Count whole-word occurrences of each entry of *words_to_count* in *text*.

    Each entry is escaped and searched for between word boundaries, so
    entries with regex metacharacters are matched literally. An empty entry
    keeps a count of 0 instead of matching at every word boundary.

    Args:
        text: The text to search.
        words_to_count: Words or phrases to look for; any iterable of strings
            works.

    Returns:
        A dict mapping every entry to its number of occurrences.
    """
    counted = {word: 0 for word in words_to_count}
    for w in counted:
        if w:
            counted[w] = len(re.findall(r"\b" + re.escape(w) + r"\b", text))
    return counted

In [135]:
# Write the counts to "counted_no_duplicates.csv".
# NOTE(review): `companies_words_count2` is not assigned anywhere in the
# visible notebook - presumably a de-duplicated variant of
# `companies_words_count`; confirm before re-running this cell.
pd.DataFrame(companies_words_count2).to_csv("counted_no_duplicates.csv")

In [58]:
# Write the current contents of `companies_words_count` to "words_counted.csv".
pd.DataFrame(companies_words_count).to_csv("words_counted.csv")

In [329]:
# Shortest entries first: an entry can only be contained in a longer one, so
# after sorting by length every containing entry comes later in the list.
# Without the sort the pairs found depended on the set's iteration order.
wkeys = sorted(overall_words, key=len)
recent_words.sort(key=len)

# Maps each entry to the set of longer entries that contain it as a substring.
matched_words = defaultdict(set)
for position, w in enumerate(wkeys):
    for candidate in wkeys[position + 1:]:
        if w in candidate.strip() and w != candidate:
            matched_words[w].add(candidate)

In [None]:
# Notebook display: show the per-report sentiment counts.
sentiment_results

In [331]:
# Work on a deep copy so that later in-place changes to the per-report counts
# leave `counting_results` untouched.
companies_words_count = copy.deepcopy(counting_results)

In [332]:
# Remove from every entry the occurrences that were already counted for the
# longer entries containing it. Entries are processed from the longest key
# to the shortest.
longest_first = sorted(matched_words.items(), key=lambda item: len(item[0]), reverse=True)
for words_counts in companies_words_count.values():
    for word, matches in longest_first:
        for w in matches:
            words_counts[word] = words_counts[word] - words_counts[w]

In [None]:
# Notebook display: show the adjusted per-report word counts.
companies_words_count

In [256]:

# Load the three topic word lists; every line is stripped of its newline
# and normalized with filter_normalize (the helper defined in this notebook).
with open("trash/eco_words.txt") as eco:
    eco_words = {filter_normalize(line.replace("\n", "")) for line in eco}

with open("trash/soc_words.txt") as soc:
    soc_words = {filter_normalize(line.replace("\n", "")) for line in soc}

with open("trash/corp.txt") as corp:
    corp_words = {filter_normalize(line.replace("\n", "")) for line in corp}

# Union of all three topic lists.
overall_words = eco_words | soc_words | corp_words

In [276]:

# Load the v2 topic word lists (cp1251-encoded); every line is stripped of
# its newline and normalized with filter_normalize (the helper defined in
# this notebook).
with open("trash/ecologyv2.txt", encoding="cp1251") as eco:
    eco_words = {filter_normalize(line.replace("\n", "")) for line in eco}

with open("trash/socialv2.txt", encoding="cp1251") as soc:
    soc_words = {filter_normalize(line.replace("\n", "")) for line in soc}

with open("trash/corp_govv2.txt", encoding="cp1251") as corp:
    corp_words = {filter_normalize(line.replace("\n", "")) for line in corp}

# Union of all three topic lists.
overall_words = eco_words | soc_words | corp_words
# Drop the empty entry produced by blank lines; discard() does not fail when
# the files contain no blank line.
overall_words.discard("")


In [308]:
# Load the v3 topic word lists (cp1251-encoded); every line is stripped of
# its newline and normalized with filter_normalize (the helper defined in
# this notebook).
with open("trash/ecologyv3.txt", encoding="cp1251") as eco:
    eco_words = {filter_normalize(line.replace("\n", "")) for line in eco}

with open("trash/socialv3.txt", encoding="cp1251") as soc:
    soc_words = {filter_normalize(line.replace("\n", "")) for line in soc}

with open("trash/corp_govv3.txt", encoding="cp1251") as corp:
    corp_words = {filter_normalize(line.replace("\n", "")) for line in corp}

# Union of all three topic lists.
overall_words = eco_words | soc_words | corp_words
# Drop the empty entry produced by blank lines, if any; discard() is a no-op
# when there is none.
overall_words.discard("")

In [334]:
# Per-report counts restricted to each topic word list; a word that has no
# count in a report gets 0.
eco_comp_words = {}
soc_comp_words = {}
corp_comp_words = {}

for k, words_counts in companies_words_count.items():
    eco_comp_words[k] = {term: words_counts.get(term, 0) for term in eco_words}
    soc_comp_words[k] = {term: words_counts.get(term, 0) for term in soc_words}
    corp_comp_words[k] = {term: words_counts.get(term, 0) for term in corp_words}


In [335]:
# Export the per-topic count tables as UTF-8 CSV files, one file per topic.
pd.DataFrame(eco_comp_words).to_csv("eco_comp_words.csv", encoding="utf-8")
pd.DataFrame(soc_comp_words).to_csv("soc_comp_words.csv", encoding="utf-8")
pd.DataFrame(corp_comp_words).to_csv("corp_comp_words.csv", encoding="utf-8")
# NOTE(review): `companies_reports` is not assigned anywhere in the visible
# notebook - confirm it exists before re-running this cell.
pd.DataFrame(companies_reports).to_csv("companies_reports.csv", encoding="utf-8")

In [None]:
# Notebook display: show the per-report counts for the ecology word list.
eco_comp_words

In [409]:
# Export the per-report sentiment counts as a UTF-8 CSV file.
pd.DataFrame(sentiment_results).to_csv("sentiment_results.csv", encoding="utf-8")

In [246]:
# Export the per-report word count and average word length as a UTF-8 CSV file.
pd.DataFrame(words_charact).to_csv("words_char.csv", encoding="utf-8")

In [None]:
# Notebook display: show the per-report word counts returned by count_words.
counting_results

In [None]:
# Notebook display: show the per-report counts for the ecology word list.
eco_comp_words