In [None]:
# coding=utf-8
import pandas as pd
import time
import tiktoken
import requests  # Para fazer requisições à API da DeepSeek
from flask import Flask, request, jsonify
import os
from scipy.spatial.distance import cosine
import numpy as np
import ast

In [None]:
# Initialize the Flask application; the @app.route handlers further down register on this object
app = Flask(__name__)

# Load the previously computed embeddings, if a cache file exists
def string_to_array(x):
    """Parse the text form of a list of numbers (as stored in the CSV) into a NumPy array."""
    return np.array(ast.literal_eval(x))


# Start from an empty frame that already has the columns create_context reads
# ('embedding', 'text', 'n_tokens'), so a server without a cache yields an
# empty context instead of failing with KeyError: 'embedding'.
dataframe = pd.DataFrame(columns=['text', 'n_tokens', 'embedding'])

# isfile() checks the one path directly (and ignores a directory of the same
# name) instead of listing the whole working directory.
if os.path.isfile('training_embeddings.csv'):
    dataframe = pd.read_csv('training_embeddings.csv', index_col=0)
    # The CSV stores each embedding as the string repr of a list; turn it back
    # into a numeric array so distances can be computed on it.
    dataframe['embedding'] = dataframe['embedding'].apply(string_to_array)

In [None]:
# Function that loads the DeepSeek API key from disk
def get_deepseek_api_key(path='deepseek_key.txt'):
    """Read the DeepSeek API key from a text file.

    Args:
        path: Location of the key file. Defaults to ``deepseek_key.txt`` in
            the current working directory, which is what existing callers use.

    Returns:
        The file contents with leading and trailing whitespace removed.

    Raises:
        OSError: If the file cannot be opened (e.g. ``FileNotFoundError``).
    """
    with open(path, 'r') as f:
        # strip() drops the trailing newline most editors add to the file
        return f.read().strip()

# Load the DeepSeek API key once when this cell runs; query_deepseek_api reads
# this module-level variable to build its Authorization header.
deepseek_api_key = get_deepseek_api_key()
print('DeepSeek API key loaded')

In [None]:
# Function that sends a prompt to the DeepSeek chat-completions API
def query_deepseek_api(prompt, timeout=60):
    """Send ``prompt`` as a single user message to the DeepSeek chat endpoint.

    Args:
        prompt: Text placed in the ``content`` of the one user message.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, which would block the
            Flask worker handling the request.

    Returns:
        The response body parsed as JSON. The HTTP status code is not
        checked here, so an error response is returned as whatever JSON the
        API sent back.

    Raises:
        requests.exceptions.RequestException: On connection failure or timeout.
        ValueError: If the response body is not valid JSON.
    """
    url = "https://api.deepseek.com/v1/chat/completions"  # DeepSeek endpoint
    headers = {
        # deepseek_api_key is the module-level key loaded at startup
        "Authorization": f"Bearer {deepseek_api_key}",
        "Content-Type": "application/json"
    }
    data = {
        "model": "deepseek-chat",  # DeepSeek model
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.6,
        "max_tokens": 150,
        "top_p": 0.7,
        "frequency_penalty": 0,
        "presence_penalty": 0,
        # '###' is also the separator create_context places between context passages
        "stop": ['###']
    }
    response = requests.post(url, headers=headers, json=data, timeout=timeout)
    return response.json()

In [None]:
# Functions that build the embeddings index ("train" the model)
def _split_into_chunks(text, tokenizer, max_tokens):
    """Split ``text`` on spaces into chunks that stay within ``max_tokens``.

    Args:
        text: The string to split.
        tokenizer: Object whose ``encode(str)`` returns a sequence of tokens.
        max_tokens: Token budget per chunk.

    Returns:
        A list of strings. Words are re-joined with the same single space
        they were split on, so each chunk is a verbatim slice of ``text``.
        A single word that alone exceeds ``max_tokens`` is dropped.
    """
    chunks = []
    current_words = []
    tokens_so_far = 0

    for word in text.split(' '):
        n_tokens = len(tokenizer.encode(" " + word))

        # Flush the running chunk before it would overflow the budget. The
        # emptiness guard prevents emitting a chunk with no content.
        if current_words and tokens_so_far + n_tokens > max_tokens:
            chunks.append(" ".join(current_words))
            current_words = []
            tokens_so_far = 0

        # A word that can never fit in a chunk is skipped.
        if n_tokens > max_tokens:
            continue

        current_words.append(word)
        tokens_so_far += n_tokens + 1

    # Keep the words collected after the last flush; without this the tail of
    # every long text is lost.
    if current_words:
        chunks.append(" ".join(current_words))

    return chunks


def _embed_with_retry(text, max_attempts=5, wait_seconds=20):
    """Request an embedding for ``text``, pausing and retrying on rate limits.

    Re-raises the rate-limit error once ``max_attempts`` requests have failed.
    """
    # NOTE(review): `openai` is not imported in the visible import cell, so
    # this raises NameError unless it is imported elsewhere - confirm.
    for attempt in range(1, max_attempts + 1):
        try:
            return openai.Embedding.create(input=text, engine='text-embedding-ada-002')['data'][0]['embedding']
        except openai.error.RateLimitError:
            if attempt == max_attempts:
                raise
            print(f'Rate limit reached, sleeping for {wait_seconds} seconds')
            time.sleep(wait_seconds)


def train():
    """Rebuild the embeddings index from ``training.txt``.

    Steps:
        1. Read ``training.txt``, lower-case it and split it into lines; the
           lines are written to ``training.csv``.
        2. Drop blank lines and split any line longer than 512 tokens into
           smaller chunks on word boundaries.
        3. Request one embedding per chunk and write the result to
           ``training_embeddings.csv``.
        4. Replace the module-level ``dataframe`` with the new index so that
           ``answer`` uses it without a restart.

    Returns:
        None.
    """
    global dataframe

    filename = 'training.txt'
    print('Training with data from', filename)
    with open(filename, 'r') as f:
        raw_lines = f.read().lower().split('\n')

    pd.DataFrame(raw_lines, columns=['text']).to_csv('training.csv')
    print('Training data saved to training.csv')

    # Blank lines (including the one produced by a trailing newline) carry
    # nothing to embed. Filtering the in-memory strings avoids reading the CSV
    # back, where blanks come back as NaN and end up embedded as the text "nan".
    lines = [line for line in raw_lines if line.strip()]

    tokenizer = tiktoken.get_encoding("cl100k_base")
    print('Training data loaded')

    max_tokens = 512

    shortened = []
    for line in lines:
        if len(tokenizer.encode(line)) > max_tokens:
            shortened.extend(_split_into_chunks(line, tokenizer, max_tokens))
        else:
            shortened.append(line)

    trained = pd.DataFrame(shortened, columns=['text'])
    trained['n_tokens'] = trained['text'].apply(lambda x: len(tokenizer.encode(str(x))))

    total_tokens = trained['n_tokens'].sum()
    print('Total tokens:', total_tokens)
    print('total embedding cost:', total_tokens/1000 * 0.0001)

    embeddings = []
    for i, text in enumerate(trained['text']):
        print('Creating embedding', i)
        embeddings.append(_embed_with_retry(text))
        # Pause between requests to stay under the API rate limit
        time.sleep(1)

    print('Embeddings created')
    trained['embedding'] = embeddings
    # Save while the embeddings are still plain lists: their text form is what
    # the startup loader parses back with ast.literal_eval.
    trained.to_csv('training_embeddings.csv')
    print('Embeddings saved to training_embeddings.csv')

    # Publish the fresh index to the running process, using arrays like the
    # startup loader does.
    dataframe = trained.assign(embedding=trained['embedding'].apply(np.array))
    print('Training complete')

In [1]:
# Function that builds the retrieval context for a question
def create_context(question, df, max_len=1800, size="ada"):
    """Collect the texts closest to ``question`` until a token budget is hit.

    Args:
        question: The question to embed and compare against ``df``.
        df: DataFrame with ``embedding``, ``text`` and ``n_tokens`` columns.
            It is not modified.
        max_len: Token budget. Each text costs its ``n_tokens`` plus 4;
            collection stops at the first text that would exceed the budget.
        size: Unused; kept so existing callers keep working.

    Returns:
        The selected texts, nearest first, joined with a ``###`` separator
        line. An empty string when nothing fits or ``df`` has no rows.
    """
    print('Creating context for question:', question)
    # NOTE(review): `openai` is not imported in the visible import cell, so
    # this raises NameError unless it is imported elsewhere - confirm.
    q_embeddings = openai.Embedding.create(input=question, engine='text-embedding-ada-002')['data'][0]['embedding']

    # Rank a copy rather than writing a 'distances' column into `df`: callers
    # pass the shared module-level frame, which concurrent requests would
    # otherwise overwrite for each other.
    distances = df['embedding'].apply(lambda x: cosine(q_embeddings, x))
    ranked = df.assign(distances=distances).sort_values('distances', ascending=True)

    returns = []
    cur_len = 0
    for text, n_tokens in zip(ranked['text'], ranked['n_tokens']):
        cur_len += n_tokens + 4
        if cur_len > max_len:
            break
        returns.append(text)
    return "\n###\n".join(returns)

In [2]:
# Function that answers a question through the DeepSeek API
def answer(question):
    """Answer ``question`` using context retrieved from the module-level ``dataframe``.

    The context is built first (errors from that step propagate to the
    caller). The prompt is then sent to DeepSeek; if that request, or reading
    the reply out of the response, raises anything, the exception is printed
    and a fixed apology message is returned instead.

    Returns:
        The model's reply text, or the fallback message on failure.
    """
    fallback = 'Minha cabeça está doendo, não consigo responder a essa pergunta agora.'
    context = create_context(question, dataframe)
    prompt = f"Você é um assistente para o site da faculdade, seja profissional e responda as perguntas baseado no contexto abaixo, recuse educadamente caso não possa responder.\n{context}\n---\nPergunta: {question}\nResposta:"

    try:
        completion = query_deepseek_api(prompt)
        reply = completion['choices'][0]['message']['content']
    except Exception as err:
        print(err)
        reply = fallback
    return reply
        

In [None]:
# Endpoint that answers questions
@app.route('/answer', methods=['POST'])
def get_answer():
    """Handle ``POST /answer``.

    Expects a JSON object with a non-empty string under ``"question"``.

    Returns:
        ``{"answer": ...}`` on success, or ``{"error": ...}`` with HTTP 400
        when the body is not JSON or has no usable ``"question"``.
    """
    # silent=True yields None for a missing or malformed JSON body instead of
    # raising, so every bad request gets the same explicit 400 below.
    payload = request.get_json(silent=True)
    question = payload.get('question') if isinstance(payload, dict) else None
    if not isinstance(question, str) or not question.strip():
        return jsonify({'error': 'Request body must be JSON with a non-empty "question" string.'}), 400
    return jsonify({'answer': answer(question)})

In [None]:
# Endpoint that triggers training
@app.route('/train', methods=['POST'])
def train_model():
    """Handle ``POST /train``: run ``train()`` to completion, then report success as JSON."""
    train()
    status_payload = {'status': 'Training complete'}
    return jsonify(status_payload)

In [None]:
# Start the Flask server on port 2024, bound to 0.0.0.0 (all network interfaces).
# NOTE(review): neither /answer nor /train performs any authentication in the
# visible code - confirm the service is only reachable from trusted networks.
if __name__ == '__main__':
    app.run(host='0.0.0.0', port=2024)